In [237]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
import numpy as np
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

In [238]:
df = pd.read_csv("train_stances_working.csv") 
y = df['Stance']
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [239]:
def combine_text(x):
    res = x['Headline'] + ' ' + x['articleBody']
    return res

In [240]:
df["all_text"] = list(df.apply(combine_text, axis=1))

In [241]:
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,all_text
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,Hundreds of Palestinians flee floods in Gaza a...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,"Christian Bale passes on role of Steve Jobs, a..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,HBO and Apple in Talks for $15/Month Apple TV ...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",Spider burrowed through tourist's stomach and ...


#### Preprocess headline, body and all text

In [242]:
import re
stop_words = list(set(stopwords.words('english')))
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [243]:
df['all_text_processed'] = df['all_text'].apply(lambda x: pre_process(x))
df['all_headlines_processed'] = df['Headline'].apply(lambda x: pre_process(x))
df['all_articles_processed'] = df['articleBody'].apply(lambda x: pre_process(x))


In [244]:
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,all_text,all_text_processed,all_headlines_processed,all_articles_processed
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...,police find mass graves with at least bodies n...,police find mass graves with at least bodies n...,danny boyle is directing the untitled film set...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,Hundreds of Palestinians flee floods in Gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,"Christian Bale passes on role of Steve Jobs, a...",christian bale passes on role of steve jobs ac...,christian bale passes on role of steve jobs ac...,year old moscow resident was hospitalized wit...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,HBO and Apple in Talks for $15/Month Apple TV ...,hbo and apple in talks for month apple tv stre...,hbo and apple in talks for month apple tv stre...,reuters a canadian soldier was shot at the ca...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",Spider burrowed through tourist's stomach and ...,spider burrowed through tourist s stomach and ...,spider burrowed through tourist s stomach and ...,fear not arachnophobes the story of bunbury s ...


In [245]:
#fit a TfidfVectorizer on the concatenated strings
vec = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2,  stop_words='english')
vec.fit(df["all_text"]) # Tf-idf calculated on the combined training + test set
vocabulary = vec.vocabulary_

In [246]:
vecH = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2, vocabulary=vocabulary)

In [247]:
vecB = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2, vocabulary=vocabulary)

In [248]:
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,all_text,all_text_processed,all_headlines_processed,all_articles_processed
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...,police find mass graves with at least bodies n...,police find mass graves with at least bodies n...,danny boyle is directing the untitled film set...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,Hundreds of Palestinians flee floods in Gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,"Christian Bale passes on role of Steve Jobs, a...",christian bale passes on role of steve jobs ac...,christian bale passes on role of steve jobs ac...,year old moscow resident was hospitalized wit...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,HBO and Apple in Talks for $15/Month Apple TV ...,hbo and apple in talks for month apple tv stre...,hbo and apple in talks for month apple tv stre...,reuters a canadian soldier was shot at the ca...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",Spider burrowed through tourist's stomach and ...,spider burrowed through tourist s stomach and ...,spider burrowed through tourist s stomach and ...,fear not arachnophobes the story of bunbury s ...


In [249]:
xHeadlineTfidf=vecH.fit_transform(df['all_headlines_processed'])
xBodyTfidf=vecB.fit_transform(df['all_articles_processed'])

In [250]:
simTfidf3 = list(map(cosine_sim, xHeadlineTfidf, xBodyTfidf))

In [None]:
for index,row in df.iterrows():
    df.loc[index,"CosineSim"] = simTfidf3[index][0][0]


In [235]:
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,all_text,all_text_processed,all_headlines_processed,all_articles_processed,CosineSim
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...,police find mass graves with at least bodies n...,police find mass graves with at least bodies n...,danny boyle is directing the untitled film set...,0.0
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,Hundreds of Palestinians flee floods in Gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians flee floods in gaza a...,hundreds of palestinians were evacuated from t...,0.258252
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,"Christian Bale passes on role of Steve Jobs, a...",christian bale passes on role of steve jobs ac...,christian bale passes on role of steve jobs ac...,year old moscow resident was hospitalized wit...,0.0
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,HBO and Apple in Talks for $15/Month Apple TV ...,hbo and apple in talks for month apple tv stre...,hbo and apple in talks for month apple tv stre...,reuters a canadian soldier was shot at the ca...,0.0
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",Spider burrowed through tourist's stomach and ...,spider burrowed through tourist s stomach and ...,spider burrowed through tourist s stomach and ...,fear not arachnophobes the story of bunbury s ...,0.118469


In [236]:
header = ["Headline", "articleBody", "Stance", "CosineSim"]
df.to_csv('output.csv', columns = header)