In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
import math
from scipy.sparse import lil_matrix 
from scipy.sparse.linalg import norm
import timeit

import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse import *

In [2]:
train_bodies=pd.read_csv('train_bodies.csv')
train_stances=pd.read_csv('train_stances.csv')
test_bodies=pd.read_csv('test_bodies.csv')
test_stances_unlabeled=pd.read_csv('test_stances_unlabeled.csv')

In [3]:
train=pd.merge(train_bodies,train_stances,how='inner',left_on='Body ID',right_on='Body ID')

## Split the training set into a training subset and a validation subset with the data numberproportion about 9:1

In [4]:
X=train[['articleBody','Headline']]
y=train['Stance'] 
X_train, X_valid, y_train, y_valid = \
        train_test_split(X, y, test_size=0.1, random_state=1,stratify=y)
print ("number of instances for training: ", len(y_train))
print ("number of instances for validation: ", len(y_valid))

number of instances for training:  44974
number of instances for validation:  4998


## Statistics of the ratios

In [None]:
unique, counts = np.unique(y_train, return_counts=True)
print('training subset ')
print('agree: disagree: discuss: unrelated =',format(counts[0]/counts[1],'0.2f'),':',counts[1]/counts[1],':',\
       format(counts[2]/counts[1],'0.2f'),':',format(counts[3]/counts[1],'0.2f'))

In [None]:
unique, counts = np.unique(y_valid, return_counts=True)
print('validation subset ')
print('agree: disagree: discuss: unrelated =',format(counts[0]/counts[1],'0.2f'),':',counts[1]/counts[1],':',\
       format(counts[2]/counts[1],'0.2f'),':',format(counts[3]/counts[1],'0.2f'))

## Extract vector representation of headlines and bodies in the all the datasets, and compute the cosine similarity between these two vectors

In [5]:
stop_words = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
        "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "co",
        "con", "could", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
        "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
        "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
        "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
        "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made",
        "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
        "must", "my", "myself", "name", "namely", "neither", "nevertheless", "next", "nine", "nobody", "now", "nowhere",
        "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see",
        "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
        "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take",
        "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
        "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
        "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
        "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
        "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
        ]

In [6]:
def clean(text):
    translator = str.maketrans('', '', string.punctuation)
    text=text.translate(translator).lower().split()
    return text

def bow(df,bow_size):
    wordcount={}   
    for doc in df:
        text=clean(doc)
        for word in text:                
            try:
                wordcount[word]+=1
            except KeyError:
                wordcount[word]=1
    wordcount = {key: value for key, value in wordcount.items() if key not in stop_words}
    bag_ofword=sorted(wordcount, key=wordcount.get, reverse=True)[:bow_size]        
    return bag_ofword,wordcount


def inverse(number):
    log = math.log(len(df)/(number+1))   
    return log 

def idf(df):
    doc_freq={} 
    for doc in df:
        doc=set(clean(doc))  #get unique word that not in stop word
        for word in doc:                           
            try:
                doc_freq[word]+=1
            except KeyError:
                doc_freq[word]=1
    dict_idf={k:inverse(v) for k,v in doc_freq.items() if k not in stop_words}
    return dict_idf


In [7]:
df=pd.concat([train['Headline'], train['articleBody']], ignore_index=True)
bag_of_word,word_count=bow(df,3000)

In [8]:
dict_idf=idf(df)

### vector representation 

In [9]:
tfidf=pd.DataFrame(columns=['words','idf'])

tfidf['words']=bag_of_word  #bag of words with order

tfidf['idf']=tfidf['words'].map(dict_idf)

In [10]:
def tfidf_array(doc):
    def count_tf(word):
        return doc.count(word)
    doc=clean(doc)
    return (tfidf['words'].apply(count_tf)*tfidf['idf']).values

def cos(row):
    x=tfidf_array(row['Headline'])
    y=tfidf_array(row['articleBody'])
   
    return np.dot(x,y.T)/(np.linalg.norm(x)*np.linalg.norm(y)) 

In [11]:
X_train['cos'] = X_train.apply(cos, axis=1)   #take 380s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


give number to nan cosine? it means either norm x or norm y equals 0, which means eiter headline or body dooesn't have any word in bag of words, so maybe extent the size of it??

### language model based representations

In [14]:
collection=sum(word_count.values())


def distrib(row,u1,u2):
    headline,article=clean(row['Headline']),clean(row['articleBody'])     
    freq=pd.DataFrame() 
    freq['words']=list(set(headline+article)) #consider the word only in D and Q
    
    la,lh= len(article),len(headline)
    pram1,pram2=la/(la+u1),lh/(lh+u2) 
    
    def pro(word):
        try:
            PwC=(word_count[word])/collection
            pro=(math.log(pram1*(article.count(word))/la+(1-pram1)*PwC))*(pram2*(headline.count(word))/lh+(1-pram2)*PwC)
            return -pro
        except KeyError:   #means this is a stop word which has been removed from word count
            pass   
    return freq['words'].apply(pro).sum()

In [15]:
X_train['KL']=X_train.apply(distrib, u1=200,u2=10,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [16]:
X_train

Unnamed: 0,articleBody,Headline,cos,KL
22494,Welsh actor Christian Bale has withdrawn from ...,Michael Brown shooting audio caught on tape?,0.000000,2.946530
21286,The United States Department of Defense said o...,Soldier shot in Ottawa at War Memorial,0.000000,2.795358
19206,Seth Rogen has been set to star as Steve Wozni...,Seth Rogen to Play Apple Co-Founder Wozniak in...,0.522050,2.545280
7424,Though Apple was mum about battery life for it...,Meet The College Student Who Claims She Was Th...,0.000000,3.240192
5199,"The 45 year old Mexican/American, born in De l...",Continuing Violence Puts Boko Haram Ceasefire ...,0.000000,3.859731
45381,Seattle/ San Francisco: Google Inc. bought six...,Kim Jong-un: obese leader 'fractured both ankles',0.000000,3.630303
3819,"FORT DEVENS, Massachusetts -\n\nInfamous Bosto...",Tropical spider 'burrowed under man's skin thr...,0.000000,3.881878
49920,A Guantanamo Bay prisoner released last year a...,US Intel: Taliban Man Released From Gitmo in B...,0.323027,2.918381
16253,From Staff Reports\nAn Internet posting claimi...,CNN plays chilling audio recording allegedly f...,0.000000,4.611434
10117,您所访问的资源已不存在。\n查看更多请返回网站主页。\n» cctvnews.cn,Tips for Chinese choosing an English name,,2.220842


In [17]:
X_train['cos'].isnull().sum()

26

## explore other features

In [20]:
!pip install gensim

from gensim.models import word2vec

Collecting gensim
  Downloading gensim-3.4.0-cp35-cp35m-manylinux1_x86_64.whl (22.6MB)
[K    100% |████████████████████████████████| 22.6MB 33kB/s  eta 0:00:01  3% |█▏                              | 788kB 12.0MB/s eta 0:00:02    6% |██                              | 1.4MB 10.8MB/s eta 0:00:02
Collecting smart-open>=1.2.1 (from gensim)
  Downloading smart_open-1.5.6.tar.gz
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading bz2file-0.98.tar.gz
Building wheels for collected packages: smart-open, bz2file
  Running setup.py bdist_wheel for smart-open ... [?25ldone
[?25h  Stored in directory: /home/nbuser/.cache/pip/wheels/36/48/35/97efc2bd1b233627131c9a936c9de23681846db707b907d353
  Running setup.py bdist_wheel for bz2file ... [?25ldone
[?25h  Stored in directory: /home/nbuser/.cache/pip/wheels/31/9c/20/996d65ca104cbca940b1b053299b68459391c01c774d073126
Successfully built smart-open bz2file
Installing collected packages: bz2file, smart-open, gensim
Successfully installed

In [22]:
import gensim

In [24]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin'

## algoritm implement

In [None]:
import random
def linear(x,y,alpha,theta,iteration):
    xTrans = x.transpose()
    thetaTrans=theta.transpose()
    for i in range(0, iteration):
        h=np.dot(x, theta)   #hypothesis
        loss=h-y        #loss
        cost = np.sum(loss ** 2) / (2 * m) #cost function
        gradient = np.dot(xTrans,loss) / m #gradient 
        theta = theta - alpha * gradient
    return theta

In [None]:
x=np.column_stack((X_train['cos'], X_train['KL']))
m,n=x.shape
x = np.c_[ np.ones(m), x] #insert a column for intetcept
theta=np.ones(3)
alpha=0.01
encoder={'disagree':3,'unrelated':1,'discuss':'2','agree':4}
y_train_linear=y_train.map(encoder)
theta = linear(x,y_train_linear, theta, alpha, 1000)
print(theta)