In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score



In [2]:
with open('data_features.pickle', 'rb') as handle:
    data_features = pickle.load(handle)

In [3]:
train_data = data_features["train_transform"]
test_data = data_features["test_transform"]

In [4]:
train_data.head()

Unnamed: 0,target,final_text,features
0,1,"[last, night, finish, watch, jane, eyr, 1983, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ..."
1,1,"[mayb, sap, sweetest, movi, ever, saw, first, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,0,"[keep, disney, well, known, practic, steal, me...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,1,"[john, water, given, us, genuin, enjoy, film, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,"[start, write, review, break, watch, movi, fir...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, ..."


In [5]:
test_data.head()

Unnamed: 0,target,final_text,features
0,1,"[late, sydney, pollack, come, grown, love, sto...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,1,"[fairli, interest, look, charact, india, burge...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,"[opportun, see, last, even, local, film, festi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1,"[anoth, raquel, welch, classic, pictur, hit, t...","[0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ..."
4,0,"[rocketship, x, view, seriou, movi, buff, foll...","[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ..."


In [6]:
train_raw = pd.DataFrame(list(train_data["features"])) 
train_raw["target"] = train_data["target"]

test_raw = pd.DataFrame(list(test_data["features"])) 
test_raw["target"] = test_data["target"]


In [7]:
print("train_raw : ", train_raw.shape)
train_raw.head()

train_raw :  (25000, 501)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,1
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


In [8]:
print("test_raw : ", test_raw.shape)
test_raw.head()

test_raw :  (25000, 501)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,3,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X_train = train_raw.loc[:, train_raw.columns != 'target']
Y_train = train_raw.loc[:, train_raw.columns == 'target']

X_test = test_raw.loc[:, test_raw.columns != 'target']
Y_test = test_raw.loc[:, test_raw.columns == 'target']

In [10]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,3,0,0,0
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,2,1,0,0,0,0,0,1,0,0


In [11]:
from sklearn.ensemble import RandomForestClassifier


In [12]:
selected_model = {
    "LinearRegression" : LinearRegression(),
#     "Ridge" : Ridge(),
#     "Lasso" : Lasso(),
    "XGBRegressor" : XGBRegressor(),
#     "SVM" : SVR(),
    "LogisticRegression" : LogisticRegression(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier()
#     "KNeighborsClassifier" : KNeighborsClassifier(),
#     "LinearDiscriminantAnalysis" : LinearDiscriminantAnalysis(),
#     "GaussianNB" : GaussianNB()
    
    
}

In [13]:
def model_evaluate() :
    
    for model_name,model_value in selected_model.items() :
        print("running model : ", model_name)
        model = model_value
        model.fit(X_train,Y_train)
        target_predicted = model.predict(X_test)
        # print(model_name, " target_predicted : ", target_predicted)
        acc = accuracy_score(Y_test["target"], target_predicted.round())

        
        
        print(model_name," : ", acc)

In [14]:
model_evaluate()

running model :  DecisionTreeClassifier
DecisionTreeClassifier  target_predicted :  [1 1 1 ... 0 1 0]
DecisionTreeClassifier  :  0.70848
running model :  LinearRegression
LinearRegression  target_predicted :  [[0.78238557]
 [0.74137366]
 [0.57575769]
 ...
 [0.61403754]
 [0.39329622]
 [0.81082792]]
LinearRegression  :  0.8352
running model :  XGBRegressor
XGBRegressor  target_predicted :  [0.94925076 0.66787237 0.62991154 ... 0.561541   0.60205287 0.22852758]
XGBRegressor  :  0.82592
running model :  LogisticRegression


  y = column_or_1d(y, warn=True)


LogisticRegression  target_predicted :  [1 1 1 ... 1 0 1]
LogisticRegression  :  0.84948
running model :  RandomForestClassifier


  


RandomForestClassifier  target_predicted :  [1 1 1 ... 1 0 1]
RandomForestClassifier  :  0.82076


In [50]:
model = LogisticRegression(n_jobs=5,max_iter=200)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=5, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
model.fit(X_train,Y_train)
Y_test_predicted = model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [52]:
Y_test["target_predicted"] = Y_test_predicted.round()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
Y_test.head()

Unnamed: 0,target,target_predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,0,1


In [54]:
acc = accuracy_score(Y_test["target"], Y_test["target_predicted"].round())
precision = precision_score(Y_test["target"], Y_test["target_predicted"],average="weighted")
recall = recall_score(Y_test["target"], Y_test["target_predicted"],average="weighted")
f1 = f1_score(Y_test["target"], Y_test["target_predicted"],average="weighted")
roc = roc_auc_score(Y_test["target"], Y_test["target_predicted"].round())

print("acc : ", acc)
print("precision : ", precision)
print("recall : ", recall)
print("f1 : ", f1)
print("roc : ", roc)

acc :  0.84948
precision :  0.8496747548514619
recall :  0.84948
f1 :  0.8494590386765453
roc :  0.84948
