In [1]:
### Import LogisticRegression and the other classes this notebook needs

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Import from the public package: the private `sklearn.linear_model.logistic`
# module path was deprecated and later removed from scikit-learn, so importing
# through it fails on current releases.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score




In [2]:
# Read the labelled articles from the CSV in the working directory and
# print the resulting frame for a first look at its columns.
df = pd.read_csv(filepath_or_buffer='all.csv')
print(df)

      Unnamed: 0     type                                    content_cleaned  \
0              1  article  speaker robin vos r rochester senate majority ...   
1             14  article                                       page looking   
2             15  article  contrary narrative pushed mainstream covid 19 ...   
3             24  article  programming alert exclusive documentary origin...   
4             30  article  buffalo ny wivbmayor byron brown handing mask ...   
...          ...      ...                                                ...   
1742        1573  article  rush transcript week george stephanopoulos air...   
1743        1576  article  unprecedented moment american history need unp...   
1744        1578  article  gov greg abbott tuesday issued amount statewid...   
1745        1586  article  pulled trajectory chart work new version added...   
1746        1588  article  fox business lou dobbs relentlessly grilled tr...   

      label  
0         1  
1         1

In [3]:
# Class-balance check: count how many rows carry each value of the target label.
df.loc[:, 'label'].value_counts()

0    1397
1     350
Name: label, dtype: int64

In [4]:
### Split into training and test sets (20% held out for testing, seed fixed at 0)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['content_cleaned'],
    df['label'],
    test_size=0.2,
    random_state=0,
)

In [5]:
### Create TfidfVectorizer. Fit and transform.
# ngram_range=(1, 1) restricts the features to single-word terms (unigrams).
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# learn the 'vocabulary' of the training data & transform training data into a 'document-term matrix'
X_train = vectorizer.fit_transform(X_train_raw)

# transform the test data into a 'document-term matrix' using the vocabulary already
# learned from the training data (transform() does not re-fit on the test set)
X_test = vectorizer.transform(X_test_raw)

In [6]:
# Build a default-configured logistic regression and train it on the TF-IDF
# training matrix. fit() returns the estimator itself, so the two steps chain.
classifier = LogisticRegression().fit(X_train, y_train)

# Predict a label for every document in the held-out test matrix.
predictions = classifier.predict(X_test)


In [7]:
# Baseline model performance, before adjusting for the imbalanced dataset

baseline_report = classification_report(y_test, predictions)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       278
           1       1.00      0.40      0.57        72

    accuracy                           0.88       350
   macro avg       0.93      0.70      0.75       350
weighted avg       0.89      0.88      0.86       350



In [8]:
# Print each headline metric for the baseline predictions, one per line,
# in the order: accuracy, precision, recall, F1.
for metric_name, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_name, metric_fn(y_test, predictions))

Accuracy: 0.8771428571428571
Precision: 1.0
Recall: 0.4027777777777778
F1: 0.5742574257425743


In [9]:
# Count the training labels of each class before any oversampling is applied.
positives_before = sum(y_train == 1)
negatives_before = sum(y_train == 0)

print("Before OverSampling, counts of label '1': {}".format(positives_before))
print("Before OverSampling, counts of label '0': {} \n".format(negatives_before))
  

Before OverSampling, counts of label '1': 278
Before OverSampling, counts of label '0': 1119 



In [9]:
# The `imblearn` module is provided by the `imbalanced-learn` package.
# If it is not installed in the kernel's environment, run:
# %pip install imbalanced-learn

In [10]:
# Balance the training set with SMOTE from the imblearn library.
# Only X_train / y_train are resampled here; the test split is not passed in.

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=2)  # fixed seed for the sampler
# fit_resample() is the supported API: fit_sample() was a deprecated alias and
# has been removed from recent imbalanced-learn releases.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [11]:
# Report the shapes of the resampled training data, then the per-class label counts.
resampled_X_shape = X_train_res.shape
resampled_y_shape = y_train_res.shape
print('After OverSampling, the shape of train_X: {}'.format(resampled_X_shape))
print('After OverSampling, the shape of train_y: {} \n'.format(resampled_y_shape))

positives_after = sum(y_train_res == 1)
negatives_after = sum(y_train_res == 0)
print("After OverSampling, counts of label '1': {}".format(positives_after))
print("After OverSampling, counts of label '0': {}".format(negatives_after))

After OverSampling, the shape of train_X: (2238, 7950)
After OverSampling, the shape of train_y: (2238,) 

After OverSampling, counts of label '1': 1119
After OverSampling, counts of label '0': 1119


In [12]:
# Model performance after adjusting for the imbalanced dataset

# Train a second default logistic regression, this time on the resampled
# training data; fit() returns the estimator, so the call is chained.
classifier1 = LogisticRegression().fit(X_train_res, y_train_res)

# Score it against the same test matrix used for the baseline model.
predictions1 = classifier1.predict(X_test)

# Print the classification report
resampled_report = classification_report(y_test, predictions1)
print(resampled_report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       278
           1       0.96      0.76      0.85        72

    accuracy                           0.95       350
   macro avg       0.95      0.88      0.91       350
weighted avg       0.95      0.95      0.94       350



In [13]:
# Print each headline metric for the post-oversampling predictions, one per
# line, in the order: accuracy, precision, recall, F1.
for metric_name, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('f1:', f1_score),
):
    print(metric_name, metric_fn(y_test, predictions1))

Accuracy: 0.9457142857142857
Precision: 0.9649122807017544
Recall: 0.7638888888888888
f1: 0.8527131782945737
