# Text classification on TF-IDF features: logistic regression, linear SVM and random forest

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the cleaned training CSV into a DataFrame; the bare `df` on the last
# line renders the frame as the cell output.
# NOTE(review): the path is absolute and machine-specific, so it has to be
# adjusted before the notebook can be run anywhere else.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\data_cleaned_v1.csv'
)
df

Unnamed: 0,index,text,label
0,0,These girlfriends deserves a special mention f...,0
1,1,LeSean McCoy going through warmups with first ...,0
2,2,Tom Curran has been called up to England's Ash...,0
3,3,"We'll have turkey on the table Thursday but, a...",0
4,4,The 1945 Sinkings of the Cap Arcona and the Th...,0
...,...,...,...
499932,499995,There are a lot of things that I don't like ab...,1
499933,499996,A year after an unprecedented public outcry ag...,1
499934,499997,Battles Between the English and the Scots\n\nT...,1
499935,499998,Kurt Rambis is the new head coach of the Knick...,1


In [4]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed so the order is
# reproducible, then drop the old index so rows are renumbered from 0.
# The original call had a stray comma between the arguments, which made the
# whole cell a SyntaxError.
# Caveat: the cell reassigns `df`, so running it twice shuffles the already
# shuffled frame again.
df = df.sample(frac=1, random_state=78735).reset_index(drop=True)
df

Unnamed: 0,index,text,label
0,405715,T-Mobile USA CEO John Legere has reportedly to...,1
1,340379,The idea is that our brains do not allow us to...,1
2,42368,"If you ask Wall Street, Twitter is in trouble....",0
3,322170,This is an open-access article distributed und...,1
4,21715,Meet Dr. Waisath\n\nDr. Waisath completed his ...,0
...,...,...,...
499932,124415,CHICAGO (CBS) — Republican Illinois Sen. Mark ...,0
499933,367501,MEXICO CITY — Mexican President Enrique Peña N...,1
499934,347317,As part of our ongoing effort to improve the r...,1
499935,256796,The number of illegal immigrants in Massachuse...,1


In [5]:
# Learn the TF-IDF vocabulary from the training texts and vectorise them in
# one step. `vectorizer` stays in scope so later cells can transform other
# corpora with the same fitted vocabulary.
vectorizer = TfidfVectorizer()
Tfidf = vectorizer.fit_transform(raw_documents=df['text'])

In [6]:
# Echo the training matrix; the repr reports its shape, dtype and the number
# of stored (non-zero) entries.
Tfidf

<499937x875483 sparse matrix of type '<class 'numpy.float64'>'
	with 106916197 stored elements in Compressed Sparse Row format>

## Define models

In [7]:
# Logistic regression on the TF-IDF training matrix, using the liblinear solver.
# The fit call is left as the cell's last expression so the fitted estimator
# is echoed as the cell output.
model_LR = LogisticRegression(solver='liblinear')
model_LR.fit(X=Tfidf, y=df['label'])

In [9]:
# Linear support-vector classifier with default settings, trained on the same
# TF-IDF matrix and labels as the logistic regression.
model_SVM = LinearSVC()
model_SVM.fit(X=Tfidf, y=df['label'])

In [None]:
# Random forest trained on the same TF-IDF matrix and labels.
# random_state fixes the randomness used while growing the trees so repeated
# runs build the same forest; n_jobs=-1 fits the trees on all available cores.
# NOTE(review): this cell carries no execution count, so it was apparently
# never run to completion; no later cell uses `model_RF`.
model_RF = RandomForestClassifier(random_state=78735, n_jobs=-1)
model_RF.fit(Tfidf, df['label'])

## Testing the models

In [16]:
# Read the held-out test CSV into its own DataFrame.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
df_test = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\data_test_v1.csv'
)

In [17]:
# Reproducibly shuffle every test row (fixed seed) and renumber the index from 0.
df_test = (
    df_test
    .sample(random_state=96202, frac=1)
    .reset_index(drop=True)
)

In [18]:
# Render the shuffled test frame as the cell output.
df_test

Unnamed: 0,text,label
0,A spoof of typical 1980's teen movies. The ide...,0
1,I was a victim of a false rape report by a fel...,1
2,An artist's impression of the planet Kepler 46...,1
3,During a live stream interview conducted by Dr...,0
4,"""You guys have this thing for them,"" Mr. Trump...",1
...,...,...
9995,"RICHMOND, Va. – Police responding to a report ...",1
9996,New Delhi : A day after the Narendra Modi gove...,1
9997,"""'Babysitting' scores points for sustaining a ...",0
9998,"After five years of being unemployed, a 27-yea...",1


In [19]:
# Vectorise the test texts with the already-fitted vectorizer (transform only,
# no refit), so the columns line up with the training matrix.
Tfidf_test = vectorizer.transform(raw_documents=df_test['text'])

In [21]:
# Score the logistic regression on the held-out test set (mean accuracy).
model_LR.score(X=Tfidf_test, y=df_test['label'])

0.8806

In [22]:
# Score the linear SVM on the same held-out test set (mean accuracy).
model_SVM.score(X=Tfidf_test, y=df_test['label'])

0.8992

## Use model on news data

## Financial news

In [10]:
# Read the processed finance-news CSV.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
df_finance_news = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\news\finance_processed.csv'
)

In [11]:
# Vectorise the finance summaries with the fitted vectorizer, then echo the
# resulting sparse matrix.
# NOTE(review): the saved output for this cell reports 875540 columns while the
# training matrix output shows 875483, so the stored outputs presumably come
# from a different fit of the vectorizer. Re-run the notebook top to bottom.
Tfidf_finance = vectorizer.transform(raw_documents=df_finance_news['summary'])
Tfidf_finance

<2607x875540 sparse matrix of type '<class 'numpy.float64'>'
	with 161804 stored elements in Compressed Sparse Row format>

In [12]:
# Class probabilities for every finance summary (one column per class).
# The original cell called `model`, a name no cell in this notebook defines, so
# it raised NameError on a fresh kernel. `model_LR` is the fitted estimator
# that provides predict_proba (LinearSVC does not expose that method).
pred_finance = model_LR.predict_proba(Tfidf_finance)

In [13]:
# Wrap the probability array in a DataFrame so pandas summaries can be used;
# the columns keep their default integer labels.
pred_finance = pd.DataFrame(data=pred_finance)

In [14]:
# Average of probability column 1 across all finance summaries.
pred_finance.loc[:, 1].mean()

0.23439852232059874

In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of both
# probability columns for the finance news.
pred_finance.describe()

Unnamed: 0,0,1
count,2607.0,2607.0
mean,0.765601,0.234399
std,0.202719,0.202719
min,0.015592,0.001647
25%,0.664529,0.081271
50%,0.831888,0.168112
75%,0.918729,0.335471
max,0.998353,0.984408


## Sports news

In [16]:
# Read the processed sports-news CSV.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
df_sport_news = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\news\sport_processed.csv'
)

In [17]:
# Vectorise the sports summaries with the fitted vectorizer (no refit), then
# echo the resulting sparse matrix.
Tfidf_sport = vectorizer.transform(raw_documents=df_sport_news['summary'])
Tfidf_sport

<2981x875540 sparse matrix of type '<class 'numpy.float64'>'
	with 189785 stored elements in Compressed Sparse Row format>

In [18]:
# Class probabilities for every sports summary (one column per class).
# The original cell called `model`, a name no cell in this notebook defines, so
# it raised NameError on a fresh kernel. `model_LR` is the fitted estimator
# that provides predict_proba (LinearSVC does not expose that method).
pred_sport = model_LR.predict_proba(Tfidf_sport)

In [19]:
# Wrap the probability array in a DataFrame so pandas summaries can be used;
# the columns keep their default integer labels.
pred_sport = pd.DataFrame(data=pred_sport)

In [20]:
# Summary statistics (count, mean, std, quartiles, min/max) of both
# probability columns for the sports news.
pred_sport.describe()

Unnamed: 0,0,1
count,2981.0,2981.0
mean,0.779206,0.220794
std,0.214143,0.214143
min,0.003625,0.001242
25%,0.688356,0.062787
50%,0.858549,0.141451
75%,0.937213,0.311644
max,0.998758,0.996375


## Election news

In [21]:
# Read the processed election-news CSV.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
df_election_news = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\news\election_processed.csv'
)

In [22]:
# Vectorise the election summaries with the fitted vectorizer (no refit), then
# echo the resulting sparse matrix.
Tfidf_election = vectorizer.transform(raw_documents=df_election_news['summary'])
Tfidf_election

<3053x875540 sparse matrix of type '<class 'numpy.float64'>'
	with 193227 stored elements in Compressed Sparse Row format>

In [23]:
# Class probabilities for every election summary (one column per class).
# The original cell called `model`, a name no cell in this notebook defines, so
# it raised NameError on a fresh kernel. `model_LR` is the fitted estimator
# that provides predict_proba (LinearSVC does not expose that method).
pred_election = model_LR.predict_proba(Tfidf_election)

In [24]:
# Wrap the probability array in a DataFrame so pandas summaries can be used;
# the columns keep their default integer labels.
pred_election = pd.DataFrame(data=pred_election)

In [25]:
# Row labels of the ten election items with the largest value in probability
# column 1: sort descending, keep the first ten, collect their index labels.
election_highprob = list(
    pred_election[1]
    .sort_values(ascending=False)
    .iloc[:10]
    .index
)

In [26]:
# Pull the summary text for those ten rows, in the same (descending) order.
# This relies on pred_election and df_election_news sharing the default
# 0..n-1 row labels.
highprob_news = df_election_news['summary'].loc[election_highprob]

In [27]:
# Show the third entry (position 2) of the selected high-probability summaries.
highprob_news.iloc[2]

"Labour is not gaining the seats that it would need to form a government at the next general election, a minister has said. Transport minister Richard Holden told the  BBC 's Elections 2023 programme: 'The truth is the seats that we need to win at a general election right across the country, you are just not seeing those Labour gains that they would need to do in order to form a government at the next general election at this stage.' Asked about the latest projections, Mr Holden said people have had a 'tough time' with the cost of living, adding: 'The Government has had to put a huge package in there, but people are still feeling it."

In [43]:
# Summary statistics (count, mean, std, quartiles, min/max) of both
# probability columns for the election news.
pred_election.describe()

Unnamed: 0,0,1
count,3053.0,3053.0
mean,0.758453,0.241547
std,0.204752,0.204752
min,0.014242,0.001338
25%,0.655025,0.082119
50%,0.819991,0.180009
75%,0.917881,0.344975
max,0.998662,0.985758
