In [1846]:
import numpy as np
import pandas as pd

In [1847]:
data = pd.read_csv("Revised Fin data.csv")

In [1848]:
data.columns

Index(['Org_ID', 'Date_published', 'Enc_ID', 'Headline', 'Synopsis',
       'Headline + Synop', 'Full_text', 'Final Status'],
      dtype='object')

In [1849]:
testsize = 0.2

In [1850]:
fd = data[["Headline + Synop","Full_text","Final Status"]]

In [1851]:
fd['Final Status'].value_counts()

Positive     215
Negative     184
Positive       1
Name: Final Status, dtype: int64

In [1852]:
# Label the final status 
fd['Final Status']=fd['Final Status'].apply(lambda x: 0 if  x=="Negative" else 1)
fd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fd['Final Status']=fd['Final Status'].apply(lambda x: 0 if  x=="Negative" else 1)


Unnamed: 0,Headline + Synop,Full_text,Final Status
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1


In [1853]:
fd['Final Status'].value_counts()

1    216
0    184
Name: Final Status, dtype: int64

In [1854]:
# Importing stopwords from nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [1855]:
# Removing some words from the stopwords list as they contain a meaning for our problem statement 
for i in ['further','over','under','up','down','above','below']:
  stopwords.remove(i)

## Full text 

In [1856]:
# An empty column for the preprocessed and marked text which is ready for embedding
fd['text_processed']=''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fd['text_processed']=''


In [1857]:
articles=[]
for i in range(0,fd.shape[0]):
    articles.append(fd.iloc[i])

In [1858]:
nfd = fd # Just renaming for ease of copying

In [1859]:
# Preprocessing

import re
import string
def preprocess(article):
  words = [word for word in str(article['Full_text']).split() if word not in stopwords] # List Comprehension
  text = " ".join(words)
  #Remove all the punctuation from the text
  text = re.sub(r'[^\w\s]', '', text)
  #Removes all numericals leaving the alphabets
  text = ''.join([i for i in text if not i.isdigit()])
  # Removing multiple spaces in the text and replacing them with a single space
  text = re.sub(' +', ' ', text)
  text = text.lower()
  article.text_processed = text

  return ("Cleaning Done!")

In [1860]:
import time
from timeit import default_timer as timer
from joblib import Parallel, delayed

start=timer()
res = Parallel(
    backend='threading',
    n_jobs=-1
)(delayed(preprocess)(x) for x in articles)
print('Done')
end=timer()
print(((end-start)/60),"mins")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


Done
0.006797264999992573 mins


In [1861]:
nfd = pd.DataFrame(articles)
nfd.head()

Unnamed: 0,Headline + Synop,Full_text,Final Status,text_processed
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0,reuterspayments companies banks loggerheads ov...
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1,agenciesthe bank presently million active user...
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1,ptikarnataka chief minister basavaraj bommaika...
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0,getty imagesthe budget fy imposed incometax in...
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1,agenciesthere view insurance industry iib able...


In [1906]:
from sklearn.model_selection import train_test_split

X = nfd['text_processed']
y = nfd['Final Status']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testsize, random_state=42)

In [1907]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])



#Decision Tree 
text_clf_DT = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier(max_depth = 4)),
])

In [1908]:
text_clf_nb.fit(X_train, y_train)

In [1909]:
# Form a prediction set
predictions = text_clf_nb.predict(X_test)

In [1910]:
# Print a classification report
print("Naive Bayes:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Naive Bayes:


              precision    recall  f1-score   support

           0       0.70      0.35      0.47        40
           1       0.57      0.85      0.68        40

    accuracy                           0.60        80
   macro avg       0.63      0.60      0.57        80
weighted avg       0.63      0.60      0.57        80



In [1911]:
text_clf_DT.fit(X_train, y_train)

In [1912]:
text_clf_DT.score(X_train, y_train)

0.86875

In [1913]:
# Form a prediction set
predictions = text_clf_DT.predict(X_test)

In [1914]:
# Print a classification report
print("Decision Tree:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Decision Tree:


              precision    recall  f1-score   support

           0       0.71      0.50      0.59        40
           1       0.62      0.80      0.70        40

    accuracy                           0.65        80
   macro avg       0.66      0.65      0.64        80
weighted avg       0.66      0.65      0.64        80



In [1915]:
# importing random forest classifier from assemble module
from sklearn.ensemble import RandomForestClassifier

In [1916]:
text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth = 5)),
])


In [1917]:
text_clf_rf.fit(X_train, y_train)

In [1918]:
text_clf_rf.score(X_train, y_train)

0.875

In [1919]:
text_clf_rf.score(X_test, y_test)

0.625

In [1920]:
from sklearn.ensemble import BaggingClassifier

In [1921]:
text_clf_b = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(DecisionTreeClassifier(max_depth = 5))),
])

In [1922]:
text_clf_b.fit(X_train, y_train)

In [1923]:
text_clf_b.score(X_train, y_train)

0.94375

In [1924]:
text_clf_b.score(X_test, y_test)

0.6375

In [1926]:
text_clf_b = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(DecisionTreeClassifier(max_depth = 4))),
])

In [1928]:
text_clf_b.fit(X_train, y_train)

In [1929]:
text_clf_b.score(X_train, y_train)

0.871875

In [1930]:
text_clf_b.score(X_test, y_test)

0.7125

## Headlinesyn 

In [1881]:
# An empty column for the preprocessed and marked text which is ready for embedding
fd['headsyntext_processed']=''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fd['headsyntext_processed']=''


In [1882]:
articles=[]
for i in range(0,fd.shape[0]):
    articles.append(fd.iloc[i])

In [1883]:
# Preprocessing

import re
import string
def preprocess(article):
  words = [word for word in str(article['Headline + Synop']).split() if word not in stopwords] # List Comprehension
  text = " ".join(words)
  #Remove all the punctuation from the text
  text = re.sub(r'[^\w\s]', '', text)
  #Removes all numericals leaving the alphabets
  text = ''.join([i for i in text if not i.isdigit()])
  # Removing multiple spaces in the text and replacing them with a single space
  text = re.sub(' +', ' ', text)
  text = text.lower()
  article.headsyntext_processed = text

  return ("Cleaning Done!")


In [1884]:
import time
from timeit import default_timer as timer
from joblib import Parallel, delayed

start=timer()
res = Parallel(
    backend='threading',
    n_jobs=-1
)(delayed(preprocess)(x) for x in articles)
print('Done')
end=timer()
print(((end-start)/60),"mins")

Done
0.0015799649999886848 mins


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


In [1885]:
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,Headline + Synop,Full_text,Final Status,text_processed,headsyntext_processed
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0,,banks holding subsidy share say payments firms...
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1,,digitally ready bank baroda aims click loans a...
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1,,karnataka attracted investment commitment rs c...
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0,,splitting provident fund accounts may delayed ...
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1,,irdai weighs proposal privatise insurance info...


In [1886]:
from sklearn.model_selection import train_test_split

X = df['headsyntext_processed']
y =df['Final Status']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testsize, random_state=42)

In [1887]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

#Decision Tree 
text_clf_DT = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier(max_depth = 5)),
])

In [1888]:
text_clf_nb.fit(X_train, y_train)

In [1889]:
# Form a prediction set
predictions = text_clf_nb.predict(X_test)

In [1890]:
# Print a classification report
print("Naive Bayes:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Naive Bayes:


              precision    recall  f1-score   support

           0       0.75      0.53      0.62        40
           1       0.63      0.82      0.72        40

    accuracy                           0.68        80
   macro avg       0.69      0.68      0.67        80
weighted avg       0.69      0.68      0.67        80



In [1891]:
text_clf_DT.fit(X_train, y_train)

In [1892]:
text_clf_DT.score(X_train, y_train)

0.778125

In [1893]:
text_clf_DT.score(X_test, y_test)

0.65

In [1894]:
# Form a prediction set
predictions = text_clf_DT.predict(X_test)

In [1895]:
# Print a classification report
print("Decision Tree:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Decision Tree:


              precision    recall  f1-score   support

           0       0.93      0.33      0.48        40
           1       0.59      0.97      0.74        40

    accuracy                           0.65        80
   macro avg       0.76      0.65      0.61        80
weighted avg       0.76      0.65      0.61        80



In [1896]:
# importing random forest classifier from assemble module
from sklearn.ensemble import RandomForestClassifier

In [1897]:
text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth = 5)),
])


In [1898]:
text_clf_rf.fit(X_train, y_train)

In [1899]:
text_clf_rf.score(X_train, y_train)

0.76875

In [1900]:
text_clf_rf.score(X_test, y_test)

0.6125

In [1901]:
from sklearn.ensemble import BaggingClassifier

In [1902]:
text_clf_b = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(DecisionTreeClassifier(max_depth = 6))),
])

In [1903]:
text_clf_b.fit(X_train, y_train)

In [1904]:
text_clf_b.score(X_train, y_train)

0.88125

In [1905]:
text_clf_b.score(X_test, y_test)

0.7375