In [1]:
# imports and settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from funs import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


pd.set_option('display.max_rows', 5)
%matplotlib inline

In [233]:
train_file_path = r'./train.csv'
df = pd.read_csv(train_file_path, lineterminator='\n')

In [234]:
df

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
...,...,...,...
6326,6327,Ma na suna ha lemon sa haddiyan kamzor hoti hn...,Negative
6327,6328,Ball poar jadooi giraft se inhe rafter aur swi...,Positive


In [235]:
submit_df_file_path = r'./20190506_test.csv'
submit_df = pd.read_csv(submit_df_file_path, lineterminator='\n')

In [236]:
submit_df

Unnamed: 0,ID,review
0,1,Hum logo ny 70 salo ma itna loan ni lia jitna ...
1,2,Us dor ke mushahir ke sath us ke gehre taluqat...
...,...,...
2710,2711,Sindh Bhar Me CNG Stations Mangal Jummerat Or ...
2711,2712,Ye kr kia rhy hain pehly ghazian ko mar dia ab...


In [295]:
X = df['review'].tolist()
X.extend(submit_df['review'].tolist())
len(X)

9040

In [315]:
train_size = 6000
# train_size = None

In [316]:
# no data cleaning yet

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['label'] == 'Positive', train_size=train_size)



In [304]:
# vectorizer = CountVectorizer(analyzer='word', max_features=5000, ngram_range=(1, 2))
max_features=4600
vectorizer = TfidfVectorizer(max_features=max_features, min_df=5, ngram_range=(1, 5))
vectorizer.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4600, min_df=5,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [317]:
X_train = vectorizer.transform(X_train)
# X_train = X_train.toarray()

X_test = vectorizer.transform(X_test)
# X_test = X_test.toarray()

In [318]:
X_train.shape

(6000, 4600)

In [307]:
def solve(clf):
    train_predictions = lr.predict_proba(X_train)[:, 1]
    test_predictions = lr.predict_proba(X_test)[:, 1]

    train_auc = roc_auc_score(y_train, train_predictions)
    test_auc = roc_auc_score(y_test, test_predictions)
    
    return train_auc, test_auc

In [319]:
max_score = 0
best_c = -1
for c in np.arange(0.80, 4.51, 0.05):
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    train_auc, test_auc=  solve(lr)
    if test_auc > max_score:
        max_score = test_auc
        best_c = c
    if test_auc >= 0.829:
        print ('C: %.3f' %  (c))
        print ('trian auc: %.3f%%, test auc: %.3f%%' %  (train_auc * 100, test_auc * 100))

assert (c >= 0), 'c being negative'



C: 0.800
trian auc: 92.996%, test auc: 84.136%
C: 0.850
trian auc: 93.202%, test auc: 84.222%
C: 0.900
trian auc: 93.394%, test auc: 84.293%
C: 0.950
trian auc: 93.570%, test auc: 84.353%
C: 1.000
trian auc: 93.737%, test auc: 84.409%
C: 1.050
trian auc: 93.895%, test auc: 84.468%
C: 1.100
trian auc: 94.045%, test auc: 84.468%
C: 1.150
trian auc: 94.185%, test auc: 84.495%
C: 1.200




trian auc: 94.319%, test auc: 84.498%
C: 1.250
trian auc: 94.446%, test auc: 84.521%
C: 1.300
trian auc: 94.564%, test auc: 84.588%
C: 1.350
trian auc: 94.676%, test auc: 84.637%
C: 1.400
trian auc: 94.783%, test auc: 84.637%
C: 1.450
trian auc: 94.886%, test auc: 84.667%
C: 1.500
trian auc: 94.985%, test auc: 84.700%
C: 1.550
trian auc: 95.082%, test auc: 84.670%




C: 1.600
trian auc: 95.171%, test auc: 84.652%
C: 1.650
trian auc: 95.258%, test auc: 84.682%
C: 1.700
trian auc: 95.340%, test auc: 84.708%
C: 1.750
trian auc: 95.421%, test auc: 84.663%
C: 1.800
trian auc: 95.498%, test auc: 84.685%
C: 1.850
trian auc: 95.572%, test auc: 84.700%
C: 1.900
trian auc: 95.643%, test auc: 84.700%
C: 1.950
trian auc: 95.713%, test auc: 84.689%
C: 2.000




trian auc: 95.778%, test auc: 84.704%
C: 2.050
trian auc: 95.843%, test auc: 84.689%
C: 2.100
trian auc: 95.905%, test auc: 84.708%
C: 2.150
trian auc: 95.966%, test auc: 84.704%
C: 2.200
trian auc: 96.023%, test auc: 84.726%
C: 2.250
trian auc: 96.079%, test auc: 84.730%
C: 2.300
trian auc: 96.136%, test auc: 84.741%




C: 2.350
trian auc: 96.189%, test auc: 84.749%
C: 2.400
trian auc: 96.240%, test auc: 84.756%
C: 2.450
trian auc: 96.290%, test auc: 84.741%
C: 2.500
trian auc: 96.339%, test auc: 84.745%
C: 2.550
trian auc: 96.384%, test auc: 84.745%
C: 2.600
trian auc: 96.431%, test auc: 84.749%
C: 2.650
trian auc: 96.474%, test auc: 84.756%
C: 2.700
trian auc: 96.518%, test auc: 84.719%
C: 2.750




trian auc: 96.560%, test auc: 84.715%
C: 2.800
trian auc: 96.602%, test auc: 84.719%
C: 2.850
trian auc: 96.640%, test auc: 84.708%
C: 2.900
trian auc: 96.679%, test auc: 84.730%
C: 2.950
trian auc: 96.717%, test auc: 84.753%
C: 3.000
trian auc: 96.755%, test auc: 84.749%
C: 3.050
trian auc: 96.791%, test auc: 84.753%
C: 3.100
trian auc: 96.826%, test auc: 84.753%
C: 3.150
trian auc: 96.860%, test auc: 84.734%
C: 3.200
trian auc: 96.894%, test auc: 84.730%
C: 3.250
trian auc: 96.927%, test auc: 84.734%
C: 3.300
trian auc: 96.961%, test auc: 84.726%
C: 3.350
trian auc: 96.993%, test auc: 84.741%
C: 3.400
trian auc: 97.025%, test auc: 84.715%
C: 3.450
trian auc: 97.056%, test auc: 84.719%
C: 3.500




trian auc: 97.086%, test auc: 84.700%
C: 3.550
trian auc: 97.114%, test auc: 84.685%
C: 3.600
trian auc: 97.141%, test auc: 84.689%
C: 3.650
trian auc: 97.168%, test auc: 84.696%
C: 3.700
trian auc: 97.196%, test auc: 84.715%
C: 3.750
trian auc: 97.222%, test auc: 84.715%
C: 3.800
trian auc: 97.248%, test auc: 84.708%
C: 3.850
trian auc: 97.274%, test auc: 84.711%




C: 3.900
trian auc: 97.299%, test auc: 84.711%
C: 3.950
trian auc: 97.324%, test auc: 84.726%
C: 4.000
trian auc: 97.349%, test auc: 84.711%
C: 4.050
trian auc: 97.372%, test auc: 84.715%
C: 4.100
trian auc: 97.397%, test auc: 84.723%
C: 4.150
trian auc: 97.420%, test auc: 84.696%




C: 4.200
trian auc: 97.442%, test auc: 84.719%
C: 4.250
trian auc: 97.465%, test auc: 84.704%
C: 4.300
trian auc: 97.487%, test auc: 84.700%
C: 4.350
trian auc: 97.508%, test auc: 84.704%
C: 4.400
trian auc: 97.529%, test auc: 84.674%
C: 4.450
trian auc: 97.550%, test auc: 84.652%
C: 4.500




trian auc: 97.571%, test auc: 84.633%


In [312]:
best_c

1.6000000000000008

In [314]:
lr = LogisticRegression(C=best_c)
lr.fit(X_train, y_train)
train_auc, test_auc=  solve(lr)
print ('trian auc: %.3f%%, test auc: %.3f%%' %  (train_auc * 100, test_auc * 100))

trian auc: 95.621%, test auc: 84.786%




In [321]:
X = submit_df['review']
X

0       Hum logo ny 70 salo ma itna loan ni lia jitna ...
1       Us dor ke mushahir ke sath us ke gehre taluqat...
                              ...                        
2710    Sindh Bhar Me CNG Stations Mangal Jummerat Or ...
2711    Ye kr kia rhy hain pehly ghazian ko mar dia ab...
Name: review, Length: 2712, dtype: object

In [322]:
X = vectorizer.transform(X)
X.shape

(2712, 4600)

In [325]:
ans = lr.predict_proba(X)[:, 1]

In [326]:
ans.shape

(2712,)

In [328]:
output = pd.DataFrame(data={'ID': submit_df['ID'], 'Pred': ans})
output

Unnamed: 0,ID,Pred
0,1,0.284833
1,2,0.838462
...,...,...
2710,2711,0.098454
2711,2712,0.062837


In [330]:
output.to_csv('submit.csv', index=False)