## NIREN PATEL IRDM STANCE DETECTION BOOK 1

In [0]:
#!kill -9 -1 # Kill Switch

In [1]:
# Shell escape: list the contents of the current working directory.
!ls

datalab


In [0]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# The Colab user is authenticated first; the resulting application-default
# credentials are then handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. Get the files.
# Each Drive file is referenced by its file id and saved under a local file name.
# NOTE: the names train_bodies / train_stances / test_stances / test_bodies are
# Drive file handles here; they are rebound to DataFrames when the CSVs are read.
train_bodies = drive.CreateFile({'id':'13WyDTTLFGRsTAmnR5mzDv--I_NuUN4iV'}) 
train_bodies.GetContentFile('train_bodies.csv')  

train_stances = drive.CreateFile({'id':'1on-V4uX1F05cu_57-NJr0V6T4swCychT'}) 
train_stances.GetContentFile('train_stances.csv')


# The test stances are saved as 'test_stances_unlabeled.csv'.
test_stances = drive.CreateFile({'id':'16viDyAVz6blVGSPfHpxE3_m-ublWpC1a'})
test_stances.GetContentFile('test_stances_unlabeled.csv')

test_bodies = drive.CreateFile({'id':'1wl6TzYe1a_ipv7X8g3yS9d3p2U_2E1OK'}) 
test_bodies.GetContentFile('test_bodies.csv')

In [0]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Download the WordNet corpus into the relative "nltk_data/" directory.
nltk.download("wordnet", "nltk_data/")

[nltk_data] Downloading package wordnet to nltk_data/...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
# Load the four downloaded CSV files as pandas DataFrames (read in the listed order).
train_stances, train_bodies, test_stances, test_bodies = [
    pd.read_csv(csv_name)
    for csv_name in (
        'train_stances.csv',
        'train_bodies.csv',
        'test_stances_unlabeled.csv',
        'test_bodies.csv',
    )
]

In [0]:
# Left-join every stance row with its article body, matching on 'Body ID'.
mergedTrain1 = train_stances.merge(
    train_bodies, how='left', left_on='Body ID', right_on='Body ID'
)
mergedTest = test_stances.merge(
    test_bodies, how='left', left_on='Body ID', right_on='Body ID'
)

# SPLIT TRAINING DATA

In [0]:
# Split the merged training data 90/10 into train and validation sets.
# The fixed random_state makes the sampled rows reproducible.
mergedTrain = mergedTrain1.sample(frac=0.9, random_state=123)
# Validation set = every row whose index label was not sampled into the train set.
mergedValidation = mergedTrain1[~mergedTrain1.index.isin(mergedTrain.index)]


In [0]:
# Percentage of training rows that fall under each stance label.
mergedTrain.groupby('Stance').size().mul(100).div(len(mergedTrain))

Stance
agree         7.395220
disagree      1.658699
discuss      17.823235
unrelated    73.122846
dtype: float64

In [0]:
# Percentage of validation rows that fall under each stance label.
mergedValidation.groupby('Stance').size().mul(100).div(len(mergedValidation))

Stance
agree         7.044227
disagree      1.881129
discuss      17.870722
unrelated    73.203922
dtype: float64

In [0]:
# Lowercase every object-dtype (text) column; columns of any other dtype pass through as-is.
lowerMergedTrain, lowerMergedTest, lowerMergedValidation = [
    frame.apply(lambda column: column.str.lower() if column.dtype == 'object' else column)
    for frame in (mergedTrain, mergedTest, mergedValidation)
]

In [0]:
#lowerMergedTrain.head()

In [0]:
#lowerMergedTest.head()

# ORIGINAL VECTOR REPRESENTATION, COSINE SIMILARITY AND EUCLIDEAN DISTANCE

In [0]:
# Tokeniser: a "word" is any maximal run of \w characters.
WORD = re.compile(r'\w+')


def text_to_vector(text):
    """Turn a string into a bag-of-words vector.

    Args:
        text: the string to tokenise.

    Returns:
        A Counter mapping each word matched by WORD to its number of
        occurrences; an empty Counter when the text contains no words.
    """
    return Counter(WORD.findall(text))

In [0]:
def get_cosine(vec1, vec2):
    """Cosine similarity between two sparse term-count vectors.

    Args:
        vec1, vec2: mappings from term to count (e.g. Counters).

    Returns:
        The dot product over the shared terms divided by the product of the
        two vector norms, as a float; 0.0 when that product is zero
        (for example when either vector is empty).
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    magnitude = norm1 * norm2

    # Guard against dividing by zero for zero-length vectors.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0


In [0]:
def get_euclidian(vec1, vec2):
    """Euclidean distance between two sparse term-count vectors.

    Args:
        vec1, vec2: mappings from term to count; a term missing from one
            vector counts as 0 there.

    Returns:
        The square root of the summed squared per-term differences, taken
        over every term that appears in either vector.
    """
    all_terms = set(vec1) | set(vec2)
    squared_gap = sum(
        (vec1.get(term, 0) - vec2.get(term, 0)) ** 2 for term in all_terms
    )
    return math.sqrt(squared_gap)

In [0]:
#TEST:

#first =  "The Quick brown fox  jumps over the Lazy black dog"
#second = "Big piece"
#vector1 = text_to_vector(first)
#vector2 = text_to_vector(second)

#cosine = get_cosine(vector1, vector2)

#print('Cosine:', cosine)
#print(vector1)
#print(vector2)

In [0]:
# Vector formation: attach a bag-of-words Counter for every headline and body.
# Text without any word tokens yields an empty Counter (an all-zero vector).
#
# The Counters are assigned directly as new columns. Building a one-element
# pd.Series per row and index-merging the expanded frame back gives the same
# two columns, but is far slower and would duplicate rows on a non-unique index.
vectorTrain = lowerMergedTrain.assign(
    HeadlineVector=lowerMergedTrain['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedTrain['articleBody'].apply(text_to_vector),
)

vectorTest = lowerMergedTest.assign(
    HeadlineVector=lowerMergedTest['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedTest['articleBody'].apply(text_to_vector),
)

vectorValidation = lowerMergedValidation.assign(
    HeadlineVector=lowerMergedValidation['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedValidation['articleBody'].apply(text_to_vector),
)

In [0]:
# Cosine similarity between each row's headline vector and body vector.
# Rows with an empty vector get a similarity of 0.0.
for frame in (vectorTrain, vectorTest, vectorValidation):
    frame['CosineSim'] = frame.apply(
        lambda row: get_cosine(row['HeadlineVector'], row['bodyVector']),
        axis=1,
    )


In [0]:
# Euclidean distance between each row's headline vector and body vector.
for frame in (vectorTrain, vectorTest, vectorValidation):
    frame['Euclidian Distance'] = frame.apply(
        lambda row: get_euclidian(row['HeadlineVector'], row['bodyVector']),
        axis=1,
    )


In [0]:
# Per-stance mean of the numeric columns of the training frame.
# numeric_only=True is explicit because the frame also holds text and Counter
# columns; pandas >= 2.0 raises on those instead of silently dropping them.
vectorTrain.groupby('Stance').mean(numeric_only=True)

Unnamed: 0_level_0,Body ID,CosineSim,Euclidian Distance
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
agree,1625.703548,0.253234,38.880112
disagree,1621.730563,0.242867,40.629463
discuss,1649.028318,0.243228,45.225419
unrelated,1269.826649,0.097974,41.646969


In [0]:
# Per-stance mean of the numeric columns of the validation frame.
# numeric_only=True is explicit because the frame also holds text and Counter
# columns; pandas >= 2.0 raises on those instead of silently dropping them.
vectorValidation.groupby('Stance').mean(numeric_only=True)

Unnamed: 0_level_0,Body ID,CosineSim,Euclidian Distance
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
agree,1573.784091,0.258272,40.115902
disagree,1722.893617,0.242656,44.935701
discuss,1615.492721,0.238314,45.687095
unrelated,1235.5462,0.098505,42.424776


In [0]:
# Remove the 'Euclidian Distance' column from all three frames.
vectorTrain = vectorTrain.drop(columns='Euclidian Distance')
vectorTest = vectorTest.drop(columns='Euclidian Distance')
vectorValidation = vectorValidation.drop(columns='Euclidian Distance')

# VECTOR REPRESENTATION, COSINE SIMILARITY AND EUCLIDEAN DISTANCE WITH STOPWORD REMOVAL

In [0]:
#STOPWORD REMOVAL

# English stop-word list from the NLTK 'stopwords' corpus (downloaded in the imports cell).
stop = stopwords.words('english')

In [0]:
# Stop-word removal from the headline and body text of all three frames.
#
# Every stop word is folded into a single alternation so each column is scanned
# once, instead of once per stop word (which is what made this cell slow).
# Alternatives keep the order of `stop`, and each one is passed through
# re.escape so a stop word can never be misread as regex syntax. Matches are
# whole words only (\b ... \b) and are replaced by the empty string, so the
# word tokens later extracted by text_to_vector are the same as with
# word-by-word removal.
stopword_pattern = r'\b(?:%s)\b' % '|'.join(re.escape(word) for word in stop)

for frame in (lowerMergedValidation, lowerMergedTrain, lowerMergedTest):
    for column in ('Headline', 'articleBody'):
        frame[column] = frame[column].replace(
            to_replace=stopword_pattern, value="", regex=True
        )


In [0]:
# Vector formation on the stop-word-filtered text: attach a bag-of-words
# Counter for every headline and body.
#
# The Counters are assigned directly as new columns. Building a one-element
# pd.Series per row and index-merging the expanded frame back gives the same
# two columns, but is far slower and would duplicate rows on a non-unique index.
vectorTrain = lowerMergedTrain.assign(
    HeadlineVector=lowerMergedTrain['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedTrain['articleBody'].apply(text_to_vector),
)

vectorTest = lowerMergedTest.assign(
    HeadlineVector=lowerMergedTest['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedTest['articleBody'].apply(text_to_vector),
)

vectorValidation = lowerMergedValidation.assign(
    HeadlineVector=lowerMergedValidation['Headline'].apply(text_to_vector),
    bodyVector=lowerMergedValidation['articleBody'].apply(text_to_vector),
)

In [0]:
# Cosine similarity between each row's headline vector and body vector.
for frame in (vectorTrain, vectorTest, vectorValidation):
    frame['CosineSim'] = frame.apply(
        lambda row: get_cosine(row['HeadlineVector'], row['bodyVector']),
        axis=1,
    )

In [0]:
# Euclidean distance between each row's headline vector and body vector.
for frame in (vectorTrain, vectorTest, vectorValidation):
    frame['Euclidian Distance'] = frame.apply(
        lambda row: get_euclidian(row['HeadlineVector'], row['bodyVector']),
        axis=1,
    )


In [0]:
# Per-stance mean of the numeric columns of the training frame.
# numeric_only=True is explicit because the frame also holds text and Counter
# columns; pandas >= 2.0 raises on those instead of silently dropping them.
vectorTrain.groupby('Stance').mean(numeric_only=True)

Unnamed: 0_level_0,Body ID,CosineSim,Euclidian Distance
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
agree,1625.703548,0.294215,18.593079
disagree,1621.730563,0.27586,19.864169
discuss,1649.028318,0.295782,21.381552
unrelated,1269.826649,0.015391,20.987121


In [0]:
# Per-stance mean of the numeric columns of the validation frame.
# numeric_only=True is explicit because the frame also holds text and Counter
# columns; pandas >= 2.0 raises on those instead of silently dropping them.
vectorValidation.groupby('Stance').mean(numeric_only=True)


Unnamed: 0_level_0,Body ID,CosineSim,Euclidian Distance
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
agree,1573.784091,0.302866,19.111854
disagree,1722.893617,0.272828,21.38845
discuss,1615.492721,0.296242,21.237568
unrelated,1235.5462,0.01474,21.440879


In [0]:
# Remove the 'Euclidian Distance' column from all three frames.
vectorTrain = vectorTrain.drop(columns='Euclidian Distance')
vectorTest = vectorTest.drop(columns='Euclidian Distance')
vectorValidation = vectorValidation.drop(columns='Euclidian Distance')

In [0]:
#vectorValidation = vectorValidation.drop(['bodyVector','HeadlineVector'], axis = 1)
#vectorTrain = vectorTrain.drop(['bodyVector','HeadlineVector'], axis = 1)
#vectorTest = vectorTest.drop(['bodyVector','HeadlineVector'], axis = 1)

In [0]:
#ONLY WORKS IN CHROME  #Code to save files for next stage
#ValidationProcessed = pd.DataFrame(vectorValidation).to_csv('ValidationProcessed.csv')
#TrainProcessed = pd.DataFrame(vectorTrain).to_csv('TrainProcessed.csv')
#TestProcessed = pd.DataFrame(vectorTest).to_csv('TestProcessed.csv')

In [0]:
#Code to save files for next stage from google.colab import files

#from google.colab import files
#files.download('ValidationProcessed.csv')

In [0]:
#Code to save files for next stage from google.colab import files

#files.download('TrainProcessed.csv')

In [0]:
#Code to save files for next stage from google.colab import files

#files.download('TestProcessed.csv')

# VECTOR REPRESENTATION AND COSINE SIMILARITY WITH STOPWORD REMOVAL AND STEMMING

In [0]:
#############################################################################
#############################################################################
#############################################################################
#############################################################################
#############################################################################

###UNCOMMENT THE FOLLOWING CELLS FOR STEMMING RESULTS

In [0]:
#STEMMING:
#stemmer = PorterStemmer()

In [0]:
#lowerMergedValidation['Headline'] = lowerMergedValidation['Headline'].apply(lambda x: stemmer.stem(x))
#lowerMergedValidation['articleBody'] = lowerMergedValidation['articleBody'].apply(lambda x: stemmer.stem(x))
#lowerMergedTrain['Headline'] = lowerMergedTrain['Headline'].apply(lambda x: stemmer.stem(x))
#lowerMergedTrain['articleBody'] = lowerMergedTrain['articleBody'].apply(lambda x: stemmer.stem(x))
#lowerMergedTest['Headline'] = lowerMergedTest['Headline'].apply(lambda x: stemmer.stem(x))
#lowerMergedTest['articleBody'] = lowerMergedTest['articleBody'].apply(lambda x: stemmer.stem(x))

In [0]:
#Vector Formation
#vectorTrain = lowerMergedTrain.merge(lowerMergedTrain.Headline.apply(lambda s: pd.Series({'HeadlineVector':text_to_vector(s)})), left_index=True, right_index=True)
#vectorTrain = vectorTrain.merge(vectorTrain.articleBody.apply(lambda x: pd.Series({'bodyVector':text_to_vector(x)})), left_index=True, right_index=True)

#vectorTest = lowerMergedTest.merge(lowerMergedTest.Headline.apply(lambda s: pd.Series({'HeadlineVector':text_to_vector(s)})), left_index=True, right_index=True)
#vectorTest = vectorTest.merge(vectorTest.articleBody.apply(lambda x: pd.Series({'bodyVector':text_to_vector(x)})), left_index=True, right_index=True)

#vectorValidation = lowerMergedValidation.merge(lowerMergedValidation.Headline.apply(lambda s: pd.Series({'HeadlineVector':text_to_vector(s)})), left_index=True, right_index=True)
#vectorValidation = vectorValidation.merge(vectorValidation.articleBody.apply(lambda x: pd.Series({'bodyVector':text_to_vector(x)})), left_index=True, right_index=True)

In [0]:
# GET THE COSINE SIMILARITY

#vectorTrain['CosineSim'] = vectorTrain.apply(lambda x: get_cosine(x['HeadlineVector'], x['bodyVector']), axis=1)
#vectorTest['CosineSim'] = vectorTest.apply(lambda x: get_cosine(x['HeadlineVector'], x['bodyVector']), axis=1)
#vectorValidation['CosineSim'] = vectorValidation.apply(lambda x: get_cosine(x['HeadlineVector'], x['bodyVector']), axis=1)

In [0]:
#vectorTrain.groupby('Stance').mean()

Unnamed: 0_level_0,Body ID,CosineSim
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1
agree,1625.703548,0.279627
disagree,1621.730563,0.261399
discuss,1649.028318,0.282045
unrelated,1269.826649,0.014965


In [0]:
#vectorValidation.groupby('Stance').mean()

Unnamed: 0_level_0,Body ID,CosineSim
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1
agree,1573.784091,0.2853
disagree,1722.893617,0.261246
discuss,1615.492721,0.282476
unrelated,1235.5462,0.014201
