In [1466]:
import pandas as pd
data1 = pd.read_csv("Revised Fin data.csv")
data2 = pd.read_csv("Master data - Sample 2 Non Financial_ copy.csv")

In [1467]:
data1 = data1[['Headline + Synop', 'Full_text', 'Final Status']]

In [1468]:
data2 = data2[['Headline + Synop', 'Full_text','Final Status']]

In [1469]:
data = pd.concat([data1,data2],axis = 0)

In [1470]:
data.shape

(800, 3)

In [1471]:
testsize = 0.2

In [1472]:
data['Final Status'].value_counts()

Positive     456
Negative     343
Positive       1
Name: Final Status, dtype: int64

In [1473]:
# Label the final status 
data['Final Status']=data['Final Status'].apply(lambda x: 0 if  x=="Negative" else 1)
data.head()

Unnamed: 0,Headline + Synop,Full_text,Final Status
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1


In [1474]:
data['Final Status'].value_counts()

1    457
0    343
Name: Final Status, dtype: int64

In [1475]:
# Importing stopwords from nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [1476]:
# Removing some words from the stopwords list as they contain a meaning for our problem statement 
for i in ['further','over','under','up','down','above','below']:
  stopwords.remove(i)

## Full Text

In [1477]:
# An empty column for the preprocessed and marked text which is ready for embedding
data['text_processed']=''

In [1478]:
articles=[]
for i in range(0,data.shape[0]):
    articles.append(data.iloc[i])

In [1479]:
nfd = data

In [1480]:
# Preprocessing

import re
import string
def preprocess(article):
  words = [word for word in str(article['Full_text']).split() if word not in stopwords] # List Comprehension
  text = " ".join(words)
  #Remove all the punctuation from the text
  text = re.sub(r'[^\w\s]', '', text)
  #Removes all numericals leaving the alphabets
  text = ''.join([i for i in text if not i.isdigit()])
  # Removing multiple spaces in the text and replacing them with a single space
  text = re.sub(' +', ' ', text)
  text = text.lower()
  article.text_processed = text

  return ("Cleaning Done!")

In [1481]:
import time
from timeit import default_timer as timer
from joblib import Parallel, delayed

start=timer()
res = Parallel(
    backend='threading',
    n_jobs=-1
)(delayed(preprocess)(x) for x in articles)
print('Done')
end=timer()
print(((end-start)/60),"mins")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


Done
0.015019043333328834 mins


In [1482]:
nfd = pd.DataFrame(articles)
nfd.head()

Unnamed: 0,Headline + Synop,Full_text,Final Status,text_processed
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0,reuterspayments companies banks loggerheads ov...
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1,agenciesthe bank presently million active user...
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1,ptikarnataka chief minister basavaraj bommaika...
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0,getty imagesthe budget fy imposed incometax in...
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1,agenciesthere view insurance industry iib able...


In [1483]:
from sklearn.model_selection import train_test_split

X = nfd['text_processed']
y = nfd['Final Status']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testsize, random_state=42)

In [1484]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Logistic Regression:
#text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     #('clf', LogisticRegression()),
#])

#Decision Tree 
text_clf_DT = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier(max_depth = 3)),
])

In [1485]:
text_clf_nb.fit(X_train, y_train)

In [1486]:
# Form a prediction set
predictions = text_clf_nb.predict(X_test)

In [1487]:
# Print a classification report
print("Naive Bayes:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Naive Bayes:


              precision    recall  f1-score   support

           0       0.58      0.29      0.38        63
           1       0.65      0.87      0.74        97

    accuracy                           0.64       160
   macro avg       0.62      0.58      0.56       160
weighted avg       0.62      0.64      0.60       160



In [1488]:
text_clf_DT.fit(X_train, y_train)

In [1489]:
text_clf_DT.score(X_train, y_train)

0.7390625

In [1490]:
# Form a prediction set
predictions = text_clf_DT.predict(X_test)

In [1491]:
# Print a classification report
print("Decision Tree:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Decision Tree:


              precision    recall  f1-score   support

           0       0.61      0.44      0.51        63
           1       0.69      0.81      0.75        97

    accuracy                           0.67       160
   macro avg       0.65      0.63      0.63       160
weighted avg       0.66      0.67      0.66       160



In [1492]:
# importing random forest classifier from assemble module
from sklearn.ensemble import RandomForestClassifier

In [1493]:
text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth = 3)),
])


In [1494]:
text_clf_rf.fit(X_train, y_train)

In [1495]:
text_clf_rf.score(X_train, y_train)

0.625

In [1496]:
text_clf_rf.score(X_test, y_test)

0.625

In [1497]:
from sklearn.ensemble import BaggingClassifier

In [1498]:
text_clf_b = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(DecisionTreeClassifier(max_depth = 4))),
])

In [1499]:
text_clf_b.fit(X_train, y_train)

In [1500]:
text_clf_b.score(X_train, y_train)

0.7921875

In [1501]:
text_clf_b.score(X_test, y_test)

0.6625

## Headsyn

In [1502]:
# An empty column for the preprocessed and marked text which is ready for embedding
data['headsyntext_processed']=''

In [1503]:
articles=[]
for i in range(0,fd.shape[0]):
    articles.append(fd.iloc[i])

In [1504]:
# Preprocessing

import re
import string
def preprocess(article):
  words = [word for word in str(article['Headline + Synop']).split() if word not in stopwords] # List Comprehension
  text = " ".join(words)
  #Remove all the punctuation from the text
  text = re.sub(r'[^\w\s]', '', text)
  #Removes all numericals leaving the alphabets
  text = ''.join([i for i in text if not i.isdigit()])
  # Removing multiple spaces in the text and replacing them with a single space
  text = re.sub(' +', ' ', text)
  text = text.lower()
  article.headsyntext_processed = text

  return ("Cleaning Done!")

In [1505]:
import time
from timeit import default_timer as timer
from joblib import Parallel, delayed

start=timer()
res = Parallel(
    backend='threading',
    n_jobs=-1
)(delayed(preprocess)(x) for x in articles)
print('Done')
end=timer()
print(((end-start)/60),"mins")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


Done
0.004806891666673133 mins


In [1506]:
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,Headline + Synop,Full_text,Final Status,text_processed,headsyntext_processed
0,"Banks holding on to subsidy share, say payment...",ReutersPayments companies and banks are at log...,0,,banks holding subsidy share say payments firms...
1,Digitally ready Bank of Baroda aims to click o...,AgenciesThe bank presently has 20 million acti...,1,,digitally ready bank baroda aims click loans a...
2,Karnataka attracted investment commitment of R...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1,,karnataka attracted investment commitment rs c...
3,Splitting of provident fund accounts may be de...,Getty ImagesThe budget for FY22 had imposed in...,0,,splitting provident fund accounts may delayed ...
4,Irdai weighs proposal to privatise Insurance I...,AgenciesThere is a view in the insurance indus...,1,,irdai weighs proposal privatise insurance info...


In [1507]:
from sklearn.model_selection import train_test_split

X = df['headsyntext_processed']
y =df['Final Status']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=testsize, random_state=42)

In [1508]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])


#Decision Tree 
text_clf_DT = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier(max_depth = 4)),
])

In [1509]:
text_clf_nb.fit(X_train, y_train)

In [1510]:
# Form a prediction set
predictions = text_clf_nb.predict(X_test)

In [1511]:
# Print a classification report
print("Naive Bayes:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Naive Bayes:


              precision    recall  f1-score   support

           0       0.71      0.51      0.59        63
           1       0.73      0.87      0.79        97

    accuracy                           0.73       160
   macro avg       0.72      0.69      0.69       160
weighted avg       0.72      0.72      0.71       160



In [1512]:
text_clf_DT.fit(X_train, y_train)

In [1513]:
text_clf_DT.score(X_train, y_train)

0.7203125

In [1514]:
text_clf_DT.score(X_test, y_test)

0.6125

In [1515]:
# Form a prediction set
predictions = text_clf_DT.predict(X_test)

In [1516]:
# Print a classification report
print("Decision Tree:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Decision Tree:


              precision    recall  f1-score   support

           0       0.51      0.33      0.40        63
           1       0.65      0.79      0.71        97

    accuracy                           0.61       160
   macro avg       0.58      0.56      0.56       160
weighted avg       0.59      0.61      0.59       160



In [1517]:
# importing random forest classifier from assemble module
from sklearn.ensemble import RandomForestClassifier

In [1518]:
text_clf_rf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth = 4)),
])


In [1519]:
text_clf_rf.fit(X_train, y_train)

In [1520]:
text_clf_rf.score(X_train, y_train)

0.6296875

In [1521]:
text_clf_rf.score(X_test, y_test)

0.64375

In [1522]:
from sklearn.ensemble import BaggingClassifier

In [1523]:
text_clf_b = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(DecisionTreeClassifier(max_depth = 3))),
])

In [1524]:
text_clf_b.fit(X_train, y_train)

In [1525]:
text_clf_b.score(X_train, y_train)

0.7140625

In [1526]:
text_clf_b.score(X_test, y_test)

0.6625

In [1527]:
# Form a prediction set
predictions = text_clf_b.predict(X_test)

In [1528]:
### Print a classification report
print("Bagging:")
print("\n")
print(metrics.classification_report(y_test,predictions))

Bagging:


              precision    recall  f1-score   support

           0       0.65      0.32      0.43        63
           1       0.67      0.89      0.76        97

    accuracy                           0.66       160
   macro avg       0.66      0.60      0.59       160
weighted avg       0.66      0.66      0.63       160

