# Classification of Facts and Opinions
---


In [1]:
import pandas as pd

In [2]:
fact = pd.read_csv("FactData.csv")
opinion = pd.read_csv("OpinionData.csv")

In [3]:
fact['CATEGORY'] = 'fact'
fact.head()

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
0,1,‘Delhi CM does not speak for India’: Centre re...,"The Ministry of External Affairs (MEA), too, t...",fact
1,2,"Adhikari, Mukul Roy: CBI chargesheet seeks nod...",The application for prosecution of Adhikari an...,fact
2,3,IFFCO CEO among those booked for importing fer...,The agency Wednesday conducted searches at 12 ...,fact
3,4,Bombay HC pulls up Centre for hesitancy in ado...,A division bench of Chief Justice Dipankar Dat...,fact
4,5,"Amid flak over fertiliser price rise, Centre h...",The increase in subsidy for DAP fertilisers wi...,fact


In [4]:
opinion['CATEGORY'] = 'opinion'
opinion.head()

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
0,1,"Adhikari, Mukul Roy: CBI chargesheet seeks nod...",The application for prosecution of Adhikari an...,opinion
1,2,IFFCO CEO among those booked for importing fer...,The agency Wednesday conducted searches at 12 ...,opinion
2,3,Bombay HC pulls up Centre for hesitancy in ado...,A division bench of Chief Justice Dipankar Dat...,opinion
3,4,"Amid flak over fertiliser price rise, Centre h...",The increase in subsidy for DAP fertilisers wi...,opinion
4,5,Coimbatore temple consecrates ‘Corona Devi’ id...,"Anand Bharathi, the manager of Kamatchipuri Ad...",opinion


In [5]:
df = pd.concat([opinion,fact])
df = df.sample(frac=1).reset_index(drop=True)
df.head(20)

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
0,6241,It takes two,Opposition’s demand for sending bills to parli...,opinion
1,1199,Downloading a debt trap,How unregulated lending apps preyed on financi...,opinion
2,1924,WhatsApp working on password-protected chats b...,The screenshots posted reveal how the chat bac...,fact
3,373,Seeking ‘discharge’ against framing of charges...,A three-judge bench headed by Chief Justice N ...,fact
4,2054,Faster in IPL,"Pace has been all the rage this season, changi...",opinion
5,463,"April 3, 1981, Forty Years Ago: Hope In Gujarat",This is the front page of The Indian Express p...,opinion
6,4623,Skincare tips: Use these three ingredients to ...,"""Whether the cause is genetics or the environm...",fact
7,3154,Kangana Ranaut shares Covid-19 negative report...,Sharing her negative Covid-19 report on her In...,fact
8,1954,"Forty Years Ago, November 3, 1980: Army pumps oil","The Army was firmly on the Assam scene, facili...",opinion
9,3335,Moth and flame,Sushant Singh Rajput managed not only to break...,opinion


In [6]:
df.shape

(13710, 4)

## Cleaning headlines

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13710 entries, 0 to 13709
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        13710 non-null  int64 
 1   HEADLINE  13710 non-null  object
 2   STORY     13710 non-null  object
 3   CATEGORY  13710 non-null  object
dtypes: int64(1), object(3)
memory usage: 428.6+ KB


In [8]:
df.isnull().sum()

ID          0
HEADLINE    0
STORY       0
CATEGORY    0
dtype: int64

In [9]:
import nltk
special_char = ['(<br/>)','(<a).*(>).*(</a>)','(&amp)','(&gt)','(&lt)','(\xa0)']

In [10]:
#LOWERCASE CONVERT
df['STORY'] = df['STORY'].apply(lambda x: x.lower())
df['HEADLINE'] = df['HEADLINE'].apply(lambda x: x.lower())

In [11]:
stopwords = nltk.corpus.stopwords.words('english')

In [12]:
df['STORY'] = df['STORY'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
df['HEADLINE'] = df['HEADLINE'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))

In [13]:
# Remove Rare words
#Firstly finding frequency of each word
freq = pd.Series(" ".join(df['STORY']).split()).value_counts()
less_freq = list(freq[freq==1].index)

In [14]:
df['STORY'] = df['STORY'].apply(lambda x: " ".join(x for x in x.split() if x not in less_freq))

In [16]:
df['ID']=df['ID'].index

In [17]:
df

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
0,0,takes two,opposition’s demand sending bills parliamentar...,opinion
1,1,downloading debt trap,unregulated lending apps financial uncertainty...,opinion
2,2,whatsapp working password-protected chats back...,screenshots posted reveal chat backup work wha...,fact
3,3,seeking ‘discharge’ framing charges valuable r...,three-judge bench headed chief justice n v ram...,fact
4,4,faster ipl,"pace rage season, changing games, mocking patt...",opinion
...,...,...,...,...
13705,13705,"india opened prematurely, dr fauci tells us se...",india severely affected unprecedented second w...,fact
13706,13706,day goa prescribes ivermectin prophylaxis covi...,"march, too, recommended drug ivermectin used w...",fact
13707,13707,fifa takes point implementing newer ‘super’ pr...,fifa club world cup expanded relatively low-ke...,fact
13708,13708,house must sit,echo chambers good solutions. parliament open ...,opinion


In [18]:
# Again punctuation removal
df['HEADLINE'] = df['HEADLINE'].str.replace('[^\w\s]', '')
df['STORY'] = df['STORY'].str.replace('[^\w\s]', '')

In [21]:
df2 = df['HEADLINE'].apply(lambda x: len(x))

In [26]:
df2.describe()

count    13710.000000
mean        51.905398
std         22.742297
min          0.000000
25%         39.000000
50%         55.000000
75%         67.000000
max        142.000000
Name: HEADLINE, dtype: float64

In [19]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [28]:
df_final = df[['HEADLINE','CATEGORY']]
df_final

Unnamed: 0,HEADLINE,CATEGORY
0,takes two,opinion
1,downloading debt trap,opinion
2,whatsapp working passwordprotected chats backu...,fact
3,seeking discharge framing charges valuable rig...,fact
4,faster ipl,opinion
...,...,...
13705,india opened prematurely dr fauci tells us sen...,fact
13706,day goa prescribes ivermectin prophylaxis covi...,fact
13707,fifa takes point implementing newer super proj...,fact
13708,house must sit,opinion


In [29]:
train, test = train_test_split(df,test_size=0.2,random_state=42,shuffle=True)

In [30]:
train

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
9124,9124,right redress,statutory backing msp makes economic sense far...,opinion
6713,6713,arranging oxygen distributing food needy karna...,police officers layout police station bengalur...,fact
11446,11446,mindless development could bring calamities li...,people want risk homes fields forests rivers n...,opinion
11947,11947,government runs huge deficit tackle crisis ask...,india come present crisis little damage possib...,opinion
11756,11756,pitted oneplus 9 pros camera apples iphone 12 ...,oneplus 9 pro apple iphone 12 pro max offer be...,fact
...,...,...,...,...
5191,5191,scottish nationalists vow independence vote el...,first minister nicola said result meant would ...,fact
13418,13418,jiofiber offering extra 30 days validity annua...,jiofiber plans 2021 buying 100mbps annual broa...,fact
5390,5390,rulebook alone,regime laws enough deal pollution civil societ...,opinion
860,860,malayalam screenwriterdirector dennis joseph p...,malayalam screenwriterdirector dennis joseph p...,fact


In [31]:
test

Unnamed: 0,ID,HEADLINE,STORY,CATEGORY
7592,7592,kiren rijiju tests positive several leaders in...,union minister state youth affairs sports mini...,fact
10974,10974,transfer saga big names looking change clubs,mbappe kane big names changing clubs could th...,fact
13272,13272,expect china intensify pressure campaign tsai ...,question consider going forward whether beijin...,opinion
12659,12659,whos afraid song,needs embrace students protested caa singing f...,opinion
2117,2117,refresh hibiscus flowers colourful millet sala...,get ready summers two easy,fact
...,...,...,...,...
17,17,like wind,dutee chand shows ability leave everyone else ...,opinion
7518,7518,forty years ago april 22 1981 uspak deal,united states offered pakistan accepted fiveye...,opinion
9418,9418,70 pm modi come represent spirit times,imbibed deep insights human nature oxford indi...,opinion
1883,1883,suriyastarrer soorarai pottru selected shangha...,produced suriya banner soorarai pottru release...,fact


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train['HEADLINE'])
test_vectors = vectorizer.transform(test['HEADLINE'])

In [38]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train['CATEGORY'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(test['CATEGORY'], prediction_linear, output_dict=True)
print('opinion: ', report['opinion'])
print('fact: ', report['fact'])

Training time: 14.721190s; Prediction time: 2.293276s
opinion:  {'precision': 0.8708730741012473, 'recall': 0.8965256797583081, 'f1-score': 0.8835132117603275, 'support': 1324}
fact:  {'precision': 0.9006526468455403, 'recall': 0.8758815232722144, 'f1-score': 0.8880943868430461, 'support': 1418}


In [53]:
review = "Citizens don't pay you salaries to misbehave: Sumeet Vyas to Collectors, SDMs"
review_vector = vectorizer.transform([review])
print(classifier_linear.predict(review_vector))

['fact']


In [41]:
prediction_linear

array(['fact', 'fact', 'opinion', ..., 'opinion', 'fact', 'fact'],
      dtype=object)

In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test['CATEGORY'],prediction_linear)

array([[1242,  176],
       [ 137, 1187]], dtype=int64)