In [3]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score,precision_score,f1_score,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
import warnings

In [4]:
df = pd.read_csv('data/train.csv')

In [5]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Defining input and Output features of the data

In [6]:
y = df['label']
X = df.drop('label',axis=1)

copying the value of X in messages for feature engineering

In [7]:
messages = X.copy()
messages.reset_index(inplace=True)

Feature engineering

In [8]:
import re 
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lm  = WordNetLemmatizer()
corpus =[]
for i in range(0,len(messages)):
    review = re.sub('[^A-za-z]','',str(messages['title'][i]))
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ''.join(review)
    corpus.append(review)


Encoding the data 

In [9]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)

# Reshape the data to be a single column with each word as a row
corpus_array = [[word] for word in corpus]

# Fit and transform the data
onehot_encoded = onehot_encoder.fit_transform(corpus_array)
onehot = onehot_encoded.astype(np.float32)



In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train,X_test,y_train,y_test = train_test_split(onehot,y,train_size=0.3,random_state=21)

In [12]:
def evaluate_model(true, predicted):
    f1 = f1_score(true, predicted)
    roc_score = roc_auc_score(true, predicted)
    accuracy = accuracy_score(true, predicted)
    return f1, roc_score, accuracy

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "SVC": SVC(),
}

model_list = []
accuracy = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate train and test dataset
    model_train_f1, model_train_roc, model_train_accuracy = evaluate_model(y_train, y_train_pred)
    model_test_f1, model_test_roc, model_test_accuracy = evaluate_model(y_test, y_test_pred)

    print(model_name)
    
    print("Model performance for training set")
    print("F1 Score (Training): {:.4f}".format(model_train_f1))
    print("ROC Score (Training): {:.4f}".format(model_train_roc))
    print("Accuracy Score (Training): {:.4f}".format(model_train_accuracy))
    print("----------------------------")
    
    print("Model Performance for test data")
    print("F1 Score (Test): {:.4f}".format(model_test_f1))
    print("ROC Score (Test): {:.4f}".format(model_test_roc))
    print("Accuracy Score (Test): {:.4f}".format(model_test_accuracy))
    
    model_list.append(model_name)
    accuracy.append(model_test_accuracy)

    print('=' * 35)
    print('\n')


KNeighborsClassifier
Model performance for training set
F1 Score (Training): 0.9243
ROC Score (Training): 0.9171
Accuracy Score (Training): 0.9176
----------------------------
Model Performance for test data
F1 Score (Test): 0.6662
ROC Score (Test): 0.5000
Accuracy Score (Test): 0.4995


DecisionTree
Model performance for training set
F1 Score (Training): 1.0000
ROC Score (Training): 1.0000
Accuracy Score (Training): 1.0000
----------------------------
Model Performance for test data
F1 Score (Test): 0.2035
ROC Score (Test): 0.5566
Accuracy Score (Test): 0.5571


RandomForestClassifier
Model performance for training set
F1 Score (Training): 1.0000
ROC Score (Training): 1.0000
Accuracy Score (Training): 1.0000
----------------------------
Model Performance for test data
F1 Score (Test): 0.2035
ROC Score (Test): 0.5566
Accuracy Score (Test): 0.5571


AdaBoost
Model performance for training set
F1 Score (Training): 0.1834
ROC Score (Training): 0.5505
Accuracy Score (Training): 0.5476
----