In [None]:
import pandas as pd
import numpy as np

### Importing the Training Dataset

In [None]:
df=pd.read_table("/content/1643662645_8986752_1567602457_1187546_train_file.dat",header=None)

In [None]:
colnames = ['Target','Review']
df=pd.read_table("/content/1643662645_8986752_1567602457_1187546_train_file.dat",names=colnames,header=None)



### Dropping NaN values

In [None]:
df.dropna(inplace=True)

In [None]:
df

Unnamed: 0,Target,Review
0,1,This book is such a life saver. It has been s...
1,1,I bought this a few times for my older son and...
2,1,"This is great for basics, but I wish the space..."
3,1,This book is perfect! I'm a first time new mo...
4,1,During your postpartum stay at the hospital th...
...,...,...
18501,-1,"I really liked this monitor at first, but the ..."
18502,-1,Apparently you get what you pay for. I've use...
18503,-1,The old saying holds true with this product --...
18504,-1,We did a great deal of research before purchas...


### Removing Special Characters

In [None]:
df['Review1'] = df['Review'].fillna('').astype(str).str.replace(r'[^A-Za-z ]', '', regex=True).replace('', np.nan, regex=False)


In [None]:
df

Unnamed: 0,Target,Review,Review1
0,1,This book is such a life saver. It has been s...,This book is such a life saver It has been so...
1,1,I bought this a few times for my older son and...,I bought this a few times for my older son and...
2,1,"This is great for basics, but I wish the space...",This is great for basics but I wish the space ...
3,1,This book is perfect! I'm a first time new mo...,This book is perfect Im a first time new mom ...
4,1,During your postpartum stay at the hospital th...,During your postpartum stay at the hospital th...
...,...,...,...
18501,-1,"I really liked this monitor at first, but the ...",I really liked this monitor at first but the s...
18502,-1,Apparently you get what you pay for. I've use...,Apparently you get what you pay for Ive used ...
18503,-1,The old saying holds true with this product --...,The old saying holds true with this product y...
18504,-1,We did a great deal of research before purchas...,We did a great deal of research before purchas...


In [None]:
df.Review1[3]

'This book is perfect  Im a first time new mom and this book made it so easy to keep track of feedings diaper changes sleep  Definitely would recommend this for new moms  Plus its small enough that I throw in the diaper back for doctor visits'

### Converting the text to lowercase

In [None]:
df['Review1'] = df['Review1'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df

Unnamed: 0,Target,Review,Review1
0,1,This book is such a life saver. It has been s...,this book is such a life saver it has been so ...
1,1,I bought this a few times for my older son and...,i bought this a few times for my older son and...
2,1,"This is great for basics, but I wish the space...",this is great for basics but i wish the space ...
3,1,This book is perfect! I'm a first time new mo...,this book is perfect im a first time new mom a...
4,1,During your postpartum stay at the hospital th...,during your postpartum stay at the hospital th...
...,...,...,...
18501,-1,"I really liked this monitor at first, but the ...",i really liked this monitor at first but the s...
18502,-1,Apparently you get what you pay for. I've use...,apparently you get what you pay for ive used p...
18503,-1,The old saying holds true with this product --...,the old saying holds true with this product yo...
18504,-1,We did a great deal of research before purchas...,we did a great deal of research before purchas...


### Removing Stop-words

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')
df['Review1'] = df['Review1'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Target,Review,Review1
0,1,This book is such a life saver. It has been s...,book life saver helpful able go back track tre...
1,1,I bought this a few times for my older son and...,bought times older son bought newborn super ea...
2,1,"This is great for basics, but I wish the space...",great basics wish space write things bigger lo...
3,1,This book is perfect! I'm a first time new mo...,book perfect im first time new mom book made e...
4,1,During your postpartum stay at the hospital th...,postpartum stay hospital nurses ask keep log b...
...,...,...,...
18501,-1,"I really liked this monitor at first, but the ...",really liked monitor first second night use de...
18502,-1,Apparently you get what you pay for. I've use...,apparently get pay ive used philips audio moni...
18503,-1,The old saying holds true with this product --...,old saying holds true product get pay like led...
18504,-1,We did a great deal of research before purchas...,great deal research purchasing item loved smal...


### Stemming the words

In [None]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
df['Review1'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        book life saver help abl go back track trend a...
1        bought time older son bought newborn super eas...
2        great basic wish space write thing bigger lot ...
3        book perfect im first time new mom book made e...
4        postpartum stay hospit nurs ask keep log babi ...
                               ...                        
18501    realli like monitor first second night use dec...
18502    appar get pay ive use philip audio monitor las...
18503    old say hold true product get pay like led lig...
18504    great deal research purchas item love small si...
18505    order great success two packag liner discov mu...
Name: Review1, Length: 18497, dtype: object

### Tokenizing 

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
df['tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['Review1']), axis=1)
df.head()

Unnamed: 0,Target,Review,Review1,tokenized
0,1,This book is such a life saver. It has been s...,book life saver helpful able go back track tre...,"[book, life, saver, helpful, able, go, back, t..."
1,1,I bought this a few times for my older son and...,bought times older son bought newborn super ea...,"[bought, times, older, son, bought, newborn, s..."
2,1,"This is great for basics, but I wish the space...",great basics wish space write things bigger lo...,"[great, basics, wish, space, write, things, bi..."
3,1,This book is perfect! I'm a first time new mo...,book perfect im first time new mom book made e...,"[book, perfect, im, first, time, new, mom, boo..."
4,1,During your postpartum stay at the hospital th...,postpartum stay hospital nurses ask keep log b...,"[postpartum, stay, hospital, nurses, ask, keep..."


### Lemmatizing

In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
df['tokenized'] = df['tokenized'].apply(lambda lst:[lmtzr.lemmatize(word) for word in lst])

print(df.tokenized)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
0        [book, life, saver, helpful, able, go, back, t...
1        [bought, time, older, son, bought, newborn, su...
2        [great, basic, wish, space, write, thing, bigg...
3        [book, perfect, im, first, time, new, mom, boo...
4        [postpartum, stay, hospital, nurse, ask, keep,...
                               ...                        
18501    [really, liked, monitor, first, second, night,...
18502    [apparently, get, pay, ive, used, philip, audi...
18503    [old, saying, hold, true, product, get, pay, l...
18504    [great, deal, research, purchasing, item, love...
18505    [ordered, great, success, two, package, liner,...
Name: tokenized, Length: 18497, dtype: object


### Applying n-grams and TF-IDF 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams

In [None]:
X = df.tokenized.astype(str)

vect = TfidfVectorizer(ngram_range=(1,2)).fit(X)
df_vectorized= vect.transform(X)

In [None]:
print(df_vectorized)

  (0, 483321)	0.13881856686528127
  (0, 483086)	0.05838612068711072
  (0, 469236)	0.13881856686528127
  (0, 469164)	0.13881856686528127
  (0, 469090)	0.14203480430991539
  (0, 453728)	0.1237668782833404
  (0, 453398)	0.05852206008955268
  (0, 448875)	0.13881856686528127
  (0, 448874)	0.10171655306870465
  (0, 446657)	0.13881856686528127
  (0, 446602)	0.09010803233555839
  (0, 444471)	0.13881856686528127
  (0, 444394)	0.07716830631892549
  (0, 438447)	0.11121311123654294
  (0, 437610)	0.03448750243501236
  (0, 433981)	0.13881856686528127
  (0, 433901)	0.0813142249938548
  (0, 433278)	0.10535685579160384
  (0, 432803)	0.04611742777803885
  (0, 432162)	0.129322008697443
  (0, 431846)	0.129322008697443
  (0, 431538)	0.0825611284046913
  (0, 367663)	0.11676824165064553
  (0, 367026)	0.04893795761506844
  (0, 366032)	0.129322008697443
  :	:
  (18496, 168083)	0.033815008000805495
  (18496, 152526)	0.12022691853696998
  (18496, 152207)	0.04387094145265607
  (18496, 136454)	0.14156870883267872


In [None]:
df_vectorized.shape

(18497, 499721)

### Splitting the Training data into Train and Test data with 70:30 ratio

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(df['tokenized'],df['Target'],test_size=0.15,random_state=1)


In [None]:
print(X_train)

4273     [got, two, set, doubler, one, set, work, great...
11170    [item, wonderful, idea, two, key, flaw, isnt, ...
16957    [son, turning, month, old, started, teething, ...
13813    [receiving, gift, contacted, company, produce,...
15693    [product, really, doesnt, seem, safe, day, old...
                               ...                        
10960    [ordered, month, old, use, time, keep, one, di...
17296    [nice, idea, cute, far, able, get, stick, anyt...
5194     [mustela, dermo, cleansing, u, fl, oz, bottle,...
12177    [soon, got, tried, pack, n, play, mat, fit, pe...
235      [lock, fantastic, wow, pain, install, directio...
Name: tokenized, Length: 15722, dtype: object


In [None]:
print(y_train)

4273     1
11170   -1
16957   -1
13813   -1
15693   -1
        ..
10960    1
17296   -1
5194     1
12177   -1
235      1
Name: Target, Length: 15722, dtype: int64


### Vectorizing the Test data that we split

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
X_t = X_train.astype(str)

vect = TfidfVectorizer(ngram_range=(1,2)).fit(X_t)
X_train_vectorized= vect.transform(X_t)

In [None]:
print(X_train_vectorized)

  (0, 434389)	0.10734648483610383
  (0, 434148)	0.07317109640901197
  (0, 433524)	0.08623839628392165
  (0, 433196)	0.05004609324479266
  (0, 425687)	0.1592637008480044
  (0, 424986)	0.049167905565366474
  (0, 420083)	0.1821523505319235
  (0, 419959)	0.08369112678308469
  (0, 402092)	0.13459981749557806
  (0, 401573)	0.05965911329094782
  (0, 367913)	0.15682401086318676
  (0, 367912)	0.07878244048997027
  (0, 360246)	0.16208011624168556
  (0, 359419)	0.05481054603619511
  (0, 329115)	0.1821523505319235
  (0, 329050)	0.08742714294450223
  (0, 327835)	0.15467205178581603
  (0, 327812)	0.1821523505319235
  (0, 327374)	0.1821523505319235
  (0, 327205)	0.21453921819721425
  (0, 323063)	0.13600593689260193
  (0, 322771)	0.07328874353165298
  (0, 284818)	0.14795352331391082
  (0, 284746)	0.06723588194654186
  (0, 260202)	0.1821523505319235
  :	:
  (15721, 193571)	0.07547359881427977
  (15721, 184053)	0.1559640604847401
  (15721, 184011)	0.08360128577415649
  (15721, 183873)	0.1676177070216690

### Performing Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized,y_train)

LogisticRegression()

In [None]:
X_test_vect=X_test.astype(str)

predictions = model.predict(vect.transform(X_test_vect))

In [None]:
print(predictions)

[-1  1 -1 ...  1  1 -1]


### Calculating the ROC Score and Accuracy of our Trained Model

In [None]:
from sklearn.metrics import roc_auc_score
print("AUC:",roc_auc_score(y_test,predictions))

AUC: 0.876557496840316


In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy:",accuracy_score(y_test, predictions))

Accuracy: 0.8789189189189189


### Trying to improve our accuracy score with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
param = {'C': [0.1,1,10]}
grid = GridSearchCV(LogisticRegression(),param, cv=5,n_jobs=-1)
grid.fit(X_train_vectorized,y_train)

print(grid.best_params_)
print(grid.best_score_)

{'C': 10}
0.8799771235087805


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
y_pred = grid.predict(vect.transform(X_test_vect))
print(accuracy_score(y_test,y_pred))

0.8904504504504505


# TEST DATA

### Importing the given Test Data

In [None]:
test_data="/content/1643662645_9617953_1567602457_126649_test.dat"

f = open(test_data,'r')
test_data=f.readlines()
f.close()
print(len(test_data))

test_data=pd.DataFrame(test_data)

18506


In [None]:
test_data.rename(columns={0:'Review'},inplace=True)

In [None]:
test_data

Unnamed: 0,Review
0,Perfect for new parents. We were able to keep ...
1,Helps me know exactly how my babies day has go...
2,I wanted an alternative to printing out daily ...
3,My 3 month old son spend half of his days with...
4,The Baby Tracker brand books are the absolute ...
...,...
18501,"WTF. The pieces don't fit together, the instru..."
18502,I've gone through a couple of video baby monit...
18503,This monitor is cheap and doesn't work well. O...
18504,"These monitors do not work at all, I even atte..."


Checking for Null Values

In [None]:
test_data.isna().sum()

Review    0
dtype: int64

### Removing Punctuation from the dataset

In [None]:
test_data['Review'] = test_data['Review'].str.replace('[^\w\s]','')

  """Entry point for launching an IPython kernel.


In [None]:
test_data

Unnamed: 0,Review
0,Perfect for new parents We were able to keep t...
1,Helps me know exactly how my babies day has go...
2,I wanted an alternative to printing out daily ...
3,My 3 month old son spend half of his days with...
4,The Baby Tracker brand books are the absolute ...
...,...
18501,WTF The pieces dont fit together the instructi...
18502,Ive gone through a couple of video baby monito...
18503,This monitor is cheap and doesnt work well Ove...
18504,These monitors do not work at all I even attem...


In [None]:
test_data.Review[0]

'Perfect for new parents We were able to keep track of babys feeding sleep and diaper change schedule for the first two and a half months of her life Made life easier when the doctor would ask questions about habits because we had it all right there\n'

### Converting the text to lower-case

In [None]:
test_data['Review'] = test_data['Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
test_data

Unnamed: 0,Review
0,perfect for new parents we were able to keep t...
1,helps me know exactly how my babies day has go...
2,i wanted an alternative to printing out daily ...
3,my 3 month old son spend half of his days with...
4,the baby tracker brand books are the absolute ...
...,...
18501,wtf the pieces dont fit together the instructi...
18502,ive gone through a couple of video baby monito...
18503,this monitor is cheap and doesnt work well ove...
18504,these monitors do not work at all i even attem...


### Removing Stopwords

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
test_data['Review'] = test_data['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
test_data

Unnamed: 0,Review
0,perfect new parents able keep track babys feed...
1,helps know exactly babies day gone mother law ...
2,wanted alternative printing daily log sheets n...
3,3 month old son spend half days mother half ne...
4,baby tracker brand books absolute best tracker...
...,...
18501,wtf pieces dont fit together instructions look...
18502,ive gone couple video baby monitors compared o...
18503,monitor cheap doesnt work well half night stay...
18504,monitors work even attempted contact customer ...


### Stemming the Words

In [None]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
test_data['Review'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
test_data

Unnamed: 0,Review
0,perfect new parents able keep track babys feed...
1,helps know exactly babies day gone mother law ...
2,wanted alternative printing daily log sheets n...
3,3 month old son spend half days mother half ne...
4,baby tracker brand books absolute best tracker...
...,...
18501,wtf pieces dont fit together instructions look...
18502,ive gone couple video baby monitors compared o...
18503,monitor cheap doesnt work well half night stay...
18504,monitors work even attempted contact customer ...


### Tokenizing 

In [None]:
import pandas as pd
from nltk.tokenize import  word_tokenize
test_data['Review'] = df.apply(lambda row: nltk.word_tokenize(row['Review']), axis=1)
test_data.head()

Unnamed: 0,Review
0,"[This, book, is, such, a, life, saver, ., It, ..."
1,"[I, bought, this, a, few, times, for, my, olde..."
2,"[This, is, great, for, basics, ,, but, I, wish..."
3,"[This, book, is, perfect, !, I, 'm, a, first, ..."
4,"[During, your, postpartum, stay, at, the, hosp..."


### Applying n-grams and TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams

In [None]:
test_data_vectorized= vect.transform(test_data['Review'].astype('U').values)

In [None]:
print(test_data_vectorized)

  (0, 435961)	0.2773165866927987
  (0, 435914)	0.06711136998621481
  (0, 401812)	0.23265127841589373
  (0, 401573)	0.09467812673293353
  (0, 393998)	0.19194237910267123
  (0, 339347)	0.28907307495936124
  (0, 339258)	0.11355824788149041
  (0, 319653)	0.23712088781613927
  (0, 311735)	0.10039411703405808
  (0, 269742)	0.28907307495936124
  (0, 269480)	0.10879323732396336
  (0, 243240)	0.10934776483097387
  (0, 218028)	0.23065082543512475
  (0, 217701)	0.09945733733370303
  (0, 203474)	0.22230946426335765
  (0, 203436)	0.28624117230048246
  (0, 191932)	0.2208947901493311
  (0, 191249)	0.08941243594292307
  (0, 160901)	0.1361820956859802
  (0, 133778)	0.21055297599679504
  (0, 133107)	0.08561300267152544
  (0, 129143)	0.1322917632872818
  (0, 108808)	0.12182021707170916
  (0, 101903)	0.20079694071100143
  (0, 96316)	0.17652686978931548
  :	:
  (18505, 25601)	0.03493781532115209
  (18505, 25184)	0.03446167207801953
  (18505, 24887)	0.05523489241856402
  (18505, 24859)	0.05027255095191303
 

In [None]:
test_data_vectorized.shape

(18506, 441119)

In [None]:
predictions=model.predict(test_data_vectorized)

In [None]:
print(predictions)

[ 1  1  1 ... -1 -1 -1]


In [None]:
f = open("Simran584.dat", "a")
for i in predictions:
  f.write(str(i))
  f.write("\n")
f.close()

In [None]:
f = open("Simran584.dat","r")
for i in f:
    print(i)
f.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1

1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

1

1

1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

1

-1

1

1

-1

1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

1

1

-1

-1

1

1

1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

1

-1

1

-1

-1

-1

-1

-1

1

1

-1

-1

1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1

-1

-1

-1

-1

1

-1

-1

-1

-1