In [1]:
import pandas as pd
import warnings

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,accuracy_score, classification_report, f1_score ,confusion_matrix


In [3]:
train_data = pd.read_excel("C:\\Users\\SONY\\Desktop\\WORK\\PROJECT\\train_chunks.xlsx")  ##train data
test_data =  pd.read_excel("C:\\Users\\SONY\\Desktop\\WORK\\PROJECT\\test_chunks.xlsx")   ## test data


In [4]:
train_data.isna().any()

Text       True
Labels    False
Id        False
dtype: bool

In [5]:
train_data=train_data.dropna(axis=0)

In [6]:
test_data.isna().any()

ID        False
Text       True
Labels    False
dtype: bool

In [7]:
test_data=test_data.dropna(axis=0)

In [8]:
train = list(train_data['Text'])
test = list(test_data['Text'])
y_train = list(train_data['Labels'])
tst_id = pd.DataFrame(test_data['ID'])
y_test =list(test_data['Labels'])

In [9]:
vectorizer = TfidfVectorizer(min_df =1,stop_words='english',use_idf=True,analyzer='word',
                             ngram_range=(1,1),max_features=15000)
x_train = vectorizer.fit_transform(train)
x_test  = vectorizer.transform(test)

In [10]:
x_test.shape

(3198, 15000)

### LOGISTIC REGRESSION

In [11]:
logisticRegr = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=5,tol=0.001,max_iter=1000)
logisticRegr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=5, solver='liblinear', tol=0.001, verbose=0,
                   warm_start=False)

In [12]:
predictions = logisticRegr.predict(x_test)

In [13]:
cm = confusion_matrix(y_test, predictions)
print(cm)


[[2731   57]
 [ 162  248]]


In [14]:
accuracy_score(y_test, predictions)

0.9315196998123827

In [15]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2788
           1       0.81      0.60      0.69       410

    accuracy                           0.93      3198
   macro avg       0.88      0.79      0.83      3198
weighted avg       0.93      0.93      0.93      3198



### RANDOM FOREST

In [16]:
rand = RandomForestClassifier(n_estimators=100,criterion='entropy',max_features=None,class_weight='balanced')
rand.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [17]:
prediction2 = rand.predict(x_test)

In [18]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, prediction2))
print('\n','ACCURACY','\n',accuracy_score(y_test, prediction2))
print('\n','REPORT','\n',classification_report(prediction2,y_test))


 CONFUSION MATRIX 
 [[2562  226]
 [ 248  162]]

 ACCURACY 
 0.851782363977486

 REPORT 
               precision    recall  f1-score   support

           0       0.92      0.91      0.92      2810
           1       0.40      0.42      0.41       388

    accuracy                           0.85      3198
   macro avg       0.66      0.66      0.66      3198
weighted avg       0.86      0.85      0.85      3198



### ADA-BOOST

In [19]:
ada = AdaBoostClassifier(algorithm='SAMME.R',learning_rate=0.01)
ada.fit(x_train,y_train)  

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.01,
                   n_estimators=50, random_state=None)

In [20]:
predictions3 = ada.predict(x_test)

In [21]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions3))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions3))
print('\n','REPORT','\n',classification_report(y_test,predictions3))


 CONFUSION MATRIX 
 [[2773   15]
 [ 320   90]]

 ACCURACY 
 0.8952470293933709

 REPORT 
               precision    recall  f1-score   support

           0       0.90      0.99      0.94      2788
           1       0.86      0.22      0.35       410

    accuracy                           0.90      3198
   macro avg       0.88      0.61      0.65      3198
weighted avg       0.89      0.90      0.87      3198



### NEURAL NETWORK SKLEARN

In [22]:
from sklearn.neural_network import MLPClassifier

In [23]:
MLP = MLPClassifier(alpha=0.0001,hidden_layer_sizes=(5,2), random_state=1,solver='adam',
                    activation='relu')


In [24]:
MLP.fit(x_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [25]:
predictions4 = MLP.predict(x_test)

In [26]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions4))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions4))
print('\n','REPORT','\n',classification_report(y_test,predictions4))


 CONFUSION MATRIX 
 [[2785    3]
 [ 381   29]]

 ACCURACY 
 0.8799249530956847

 REPORT 
               precision    recall  f1-score   support

           0       0.88      1.00      0.94      2788
           1       0.91      0.07      0.13       410

    accuracy                           0.88      3198
   macro avg       0.89      0.53      0.53      3198
weighted avg       0.88      0.88      0.83      3198



### KNN

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
KN = KNeighborsClassifier( algorithm='brute')
KN.fit(x_train,y_train)
predictions5 = KN.predict(x_test)


In [29]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions5))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions5))
print('\n','REPORT','\n',classification_report(y_test,predictions5))


 CONFUSION MATRIX 
 [[2491  297]
 [ 148  262]]

 ACCURACY 
 0.8608505315822389

 REPORT 
               precision    recall  f1-score   support

           0       0.94      0.89      0.92      2788
           1       0.47      0.64      0.54       410

    accuracy                           0.86      3198
   macro avg       0.71      0.77      0.73      3198
weighted avg       0.88      0.86      0.87      3198



### SVM

In [30]:
from sklearn import svm
SVMM = svm.LinearSVC(class_weight='balanced',verbose=0, random_state=None,max_iter=1000)  

In [31]:
SVMM.fit(x_train,y_train)
predictions6 = SVMM.predict(x_test)


In [32]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions6))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions6))
print('\n','REPORT','\n',classification_report(y_test,predictions6))


 CONFUSION MATRIX 
 [[2758   30]
 [ 212  198]]

 ACCURACY 
 0.9243277048155097

 REPORT 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      2788
           1       0.87      0.48      0.62       410

    accuracy                           0.92      3198
   macro avg       0.90      0.74      0.79      3198
weighted avg       0.92      0.92      0.91      3198

