In [1]:
import pandas as pd

In [2]:
# Load the cleaned dataset from disk into a DataFrame
clean_data_path = "../Data/Clean_data.csv"
news_df = pd.read_csv(clean_data_path)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded
news_df.head()

Unnamed: 0,Title,Is_SentimentTitle_Positive
0,look health chinese economy,1
1,nouriel roubini global economy,0
2,fire claim barn hancock county,0
3,big datum internet thing add uk economy re...,1
4,china share economy generate usd,0


## Feature Selection

In [4]:
# Replace each rare word with its most similar, more common word
import spacy
from collections import Counter
import en_core_web_lg 
import json

def consolidate_words(textlist, threshold=0.8, nlp=None):
    """Map words that occur only once to their most similar recurring word.

    The strings in ``textlist`` are joined and split on whitespace to count
    word occurrences. Every word seen exactly once is compared against the
    tokens of all words seen more than once; the candidate with the highest
    similarity strictly between ``threshold`` and 1 becomes its replacement.

    Parameters
    ----------
    textlist : iterable of str
        Words (or whitespace-separated texts) to analyse.
    threshold : float, default 0.8
        Minimum similarity (exclusive) a candidate must exceed to be used.
    nlp : callable, optional
        Pipeline turning a string into an iterable of tokens that exposes
        ``.similarity(token)``; each token must expose ``.text``.
        Defaults to the ``en_core_web_lg`` model.

    Returns
    -------
    dict
        ``{rare_word: replacement_text}`` for rare words that had a candidate
        above the threshold. Rare words without one are omitted.
    """
    wordcounts = Counter(' '.join(textlist).split())
    low_words = [k for k, v in wordcounts.items() if v <= 1]
    other_words = [k for k, v in wordcounts.items() if v > 1]

    replacement_dict = {}

    # Nothing to compare: skip loading the model and parsing altogether.
    if not low_words or not other_words:
        return replacement_dict

    if nlp is None:
        # Only load the model when the caller did not supply a pipeline.
        nlp = en_core_web_lg.load()

    # Parse all candidate words once; the pipeline's tokenizer decides what
    # the individual candidate tokens are.
    tokens = nlp(' '.join(other_words))

    for word in low_words:
        word_token = nlp(word)
        # Running best score; starts at the threshold so weaker matches never win.
        max_similarity = threshold

        for tk in tokens:
            sim_score = word_token.similarity(tk)

            # A score of 1 (or more) is excluded on purpose: it means the
            # candidate is effectively the same word.
            if 1 > sim_score > max_similarity:
                replacement_dict[word] = tk.text
                max_similarity = sim_score

    return replacement_dict

In [5]:
# Flatten every title into one list of whitespace-separated words
list_of_words = []
for title in list(news_df["Title"]):
    list_of_words.extend(title.split())

In [6]:
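# Disabled: builds the replacement mapping and caches it to replacement.json;
# the next cell loads that cached file instead of recomputing it.
#consolidate_words_dict = consolidate_words(list_of_words)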
#with open('replacement.json', 'w') as fp:
#    json.dump(consolidate_words_dict, fp)

In [7]:
# Load the cached replacement mapping. If the cache file does not exist yet
# (e.g. on a fresh checkout), rebuild the mapping and save it so that the
# notebook still runs top to bottom.
try:
    with open('replacement.json') as json_file:
        consolidate_words_dict = json.load(json_file)
except FileNotFoundError:
    consolidate_words_dict = consolidate_words(list_of_words)
    with open('replacement.json', 'w') as fp:
        json.dump(consolidate_words_dict, fp)

In [8]:
def replace_consolidate(text, mapping=None):
    """Replace every word of ``text`` that has an entry in the mapping.

    Parameters
    ----------
    text : str
        Text whose words are separated by single spaces.
    mapping : dict, optional
        ``{word: replacement}`` lookup. Defaults to the notebook-level
        ``consolidate_words_dict``.

    Returns
    -------
    str
        The stripped text with mapped words substituted, joined by single
        spaces and without a trailing space. Words that have no entry are
        kept unchanged.
    """
    if mapping is None:
        mapping = consolidate_words_dict

    # dict.get keeps unknown words as-is without a catch-all ``except`` that
    # would also hide real errors (such as the mapping never being loaded).
    return " ".join(mapping.get(word, word) for word in text.strip().split(" "))

In [9]:
#news_df['Title'] = news_df['Title'].apply(replace_consolidate)

## Split data into train and test sets

In [10]:
from sklearn.model_selection import train_test_split

# Features are the titles, the target is the binary sentiment flag
titles = news_df['Title']
sentiment_labels = news_df['Is_SentimentTitle_Positive']

# 80% of the rows go to training; the seed keeps the split fixed across runs
X_train, X_test, y_train, y_test = train_test_split(
    titles,
    sentiment_labels,
    train_size=0.8,
    random_state=42,
)

## Vectorization

In [11]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# NOTE: this cell overwrites X_train / X_test (raw titles) with feature
# matrices, so re-run the train/test split cell before re-running this one.
vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.7, stop_words=stopwords.words('english'))

# Keep BOTH matrices sparse. Previously only the training matrix was densified
# with .toarray(), which costs a lot of memory and leaves train and test in
# different formats; an SVC fitted on dense data rejects sparse input at
# predict time.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Feature Selection
# Cap k at the number of available features so a small vocabulary cannot make
# SelectKBest fail.
chi2_selector = SelectKBest(chi2, k=min(2000, X_train.shape[1]))
X_train = chi2_selector.fit_transform(X_train, y_train)
X_test = chi2_selector.transform(X_test)
    

## Training the model

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.svm import SVC

def fit_and_report(model, heading, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set evaluation.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    heading : str
        Text printed before the metrics.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    The predictions of the fitted model for ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(heading)
    print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
    print("Classification Report: \n", classification_report(y_test, predictions))
    print("Accuracy Score: \n", accuracy_score(y_test, predictions))
    return predictions


# SGD shuffles the data, so it is seeded like the split and the forest to keep
# the reported numbers reproducible.
SGDC = SGDClassifier(random_state=42)
SVCM = SVC()
LR = LogisticRegression()
RFC = RandomForestClassifier(n_estimators=300, random_state=0)
MNB = MultinomialNB()

# NOTE: the *_Model variables hold each model's test-set predictions (not the
# estimators); the names are kept for compatibility with later cells.

# Logistic Regression (this is a classifier, not linear regression)
LR_Model = fit_and_report(LR, "\nLogistic Regression Algorithm\n", X_train, y_train, X_test, y_test)

# Stochastic Gradient Descent
SGDC_Model = fit_and_report(SGDC, "\nStochastic Gradient Descent Algorithm\n", X_train, y_train, X_test, y_test)

# Support Vector Model
SVCM_Model = fit_and_report(SVCM, "\nSupport Vector Method Algorithm\n", X_train, y_train, X_test, y_test)

# Random Forest Classifier
RFC_Model = fit_and_report(RFC, "\nRandom Forest Classifier  Algorithm\n", X_train, y_train, X_test, y_test)

# Multinomial Naive Bayes
MNB_Model = fit_and_report(MNB, "\nMultinomial Naive Bayes Algorithm\n", X_train, y_train, X_test, y_test)


Linear Regression Algorithm

Confusion Matrix: 
 [[3513 1029]
 [1256 2880]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.77      0.75      4542
           1       0.74      0.70      0.72      4136

    accuracy                           0.74      8678
   macro avg       0.74      0.73      0.74      8678
weighted avg       0.74      0.74      0.74      8678

Accuracy Score: 
 0.7366904816778059

Stochastic Gradient Descent Algorithm

Confusion Matrix: 
 [[3461 1081]
 [1238 2898]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.76      0.75      4542
           1       0.73      0.70      0.71      4136

    accuracy                           0.73      8678
   macro avg       0.73      0.73      0.73      8678
weighted avg       0.73      0.73      0.73      8678

Accuracy Score: 
 0.7327725282323115
