In [2]:
import pandas as pd
import warnings

In [3]:
import numpy as np

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,accuracy_score, classification_report, f1_score ,confusion_matrix


In [7]:
train_data = pd.read_excel("C:\\Users\\SONY\\Desktop\\WORK\\PROJECT\\train.xlsx")  ##train data
test_data =  pd.read_excel("C:\\Users\\SONY\\Desktop\\WORK\\PROJECT\\Test.xlsx")   ## test data


In [8]:
train_data.isna().any()

ID        False
Text      False
Labels    False
dtype: bool

In [9]:
train_data=train_data.dropna(axis=0)

In [10]:
test_data.isna().any()

ID        False
Text      False
Labels    False
dtype: bool

In [11]:
test_data=test_data.dropna(axis=0)

In [12]:
train = list(train_data['Text'])
test = list(test_data['Text'])
y_train = list(train_data['Labels'])
tst_id = pd.DataFrame(test_data['ID'])
y_test =list(test_data['Labels'])

In [13]:
vectorizer = TfidfVectorizer(min_df =1,stop_words='english',use_idf=True,analyzer='word',
                             ngram_range=(1,1),max_features=15000)
x_train = vectorizer.fit_transform(train)
x_test  = vectorizer.transform(test)

In [14]:
x_test.shape

(320, 15000)

### LOGISTIC REGRESSION

In [15]:
logisticRegr = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=5,tol=0.001,max_iter=1000)
logisticRegr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=5, solver='liblinear', tol=0.001, verbose=0,
                   warm_start=False)

In [16]:
predictions = logisticRegr.predict(x_test)

In [17]:
cm = confusion_matrix(y_test, predictions)
print(cm)


[[276   3]
 [ 15  26]]


In [18]:
accuracy_score(y_test, predictions)

0.94375

In [19]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       279
           1       0.90      0.63      0.74        41

    accuracy                           0.94       320
   macro avg       0.92      0.81      0.86       320
weighted avg       0.94      0.94      0.94       320



### RANDOM FOREST

In [20]:
rand = RandomForestClassifier(n_estimators=100,criterion='entropy',max_features=None,class_weight='balanced')
rand.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [21]:
prediction2 = rand.predict(x_test)

In [22]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, prediction2))
print('\n','ACCURACY','\n',accuracy_score(y_test, prediction2))
print('\n','REPORT','\n',classification_report(prediction2,y_test))


 CONFUSION MATRIX 
 [[279   0]
 [ 28  13]]

 ACCURACY 
 0.9125

 REPORT 
               precision    recall  f1-score   support

           0       1.00      0.91      0.95       307
           1       0.32      1.00      0.48        13

    accuracy                           0.91       320
   macro avg       0.66      0.95      0.72       320
weighted avg       0.97      0.91      0.93       320



### ADA-BOOST

In [23]:
ada = AdaBoostClassifier(algorithm='SAMME.R',learning_rate=0.01)
ada.fit(x_train,y_train)  

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.01,
                   n_estimators=50, random_state=None)

In [24]:
predictions3 = ada.predict(x_test)

In [25]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions3))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions3))
print('\n','REPORT','\n',classification_report(y_test,predictions3))


 CONFUSION MATRIX 
 [[278   1]
 [ 24  17]]

 ACCURACY 
 0.921875

 REPORT 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       279
           1       0.94      0.41      0.58        41

    accuracy                           0.92       320
   macro avg       0.93      0.71      0.77       320
weighted avg       0.92      0.92      0.91       320



### NEURAL NETWORK SKLEARN

In [26]:
from sklearn.neural_network import MLPClassifier

In [27]:
MLP = MLPClassifier(alpha=0.0001,hidden_layer_sizes=(5,2), random_state=1,solver='adam',
                    activation='relu')


In [28]:
MLP.fit(x_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [29]:
predictions4 = MLP.predict(x_test)

In [30]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions4))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions4))
print('\n','REPORT','\n',classification_report(y_test,predictions4))


 CONFUSION MATRIX 
 [[279   0]
 [ 41   0]]

 ACCURACY 
 0.871875

 REPORT 
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       279
           1       0.00      0.00      0.00        41

    accuracy                           0.87       320
   macro avg       0.44      0.50      0.47       320
weighted avg       0.76      0.87      0.81       320



  'precision', 'predicted', average, warn_for)


### KNN

In [31]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
KN = KNeighborsClassifier( algorithm='brute')
KN.fit(x_train,y_train)
predictions5 = KN.predict(x_test)


In [33]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions5))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions5))
print('\n','REPORT','\n',classification_report(y_test,predictions5))


 CONFUSION MATRIX 
 [[257  22]
 [  8  33]]

 ACCURACY 
 0.90625

 REPORT 
               precision    recall  f1-score   support

           0       0.97      0.92      0.94       279
           1       0.60      0.80      0.69        41

    accuracy                           0.91       320
   macro avg       0.78      0.86      0.82       320
weighted avg       0.92      0.91      0.91       320



### SVM

In [34]:
from sklearn import svm
SVMM = svm.LinearSVC(class_weight='balanced',verbose=0, random_state=None,max_iter=1000)  

In [35]:
SVMM.fit(x_train,y_train)
predictions6 = SVMM.predict(x_test)


In [36]:
print('\n','CONFUSION MATRIX','\n',confusion_matrix(y_test, predictions6))
print('\n','ACCURACY','\n',accuracy_score(y_test, predictions6))
print('\n','REPORT','\n',classification_report(y_test,predictions6))


 CONFUSION MATRIX 
 [[276   3]
 [ 17  24]]

 ACCURACY 
 0.9375

 REPORT 
               precision    recall  f1-score   support

           0       0.94      0.99      0.97       279
           1       0.89      0.59      0.71        41

    accuracy                           0.94       320
   macro avg       0.92      0.79      0.84       320
weighted avg       0.94      0.94      0.93       320

