In [140]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier



class SentAnalyzer():
    def __init__(self, sourcePath):
        self.data = pd.read_csv(sourcePath)
        self.indexes = self.data['index']
        self.labels = self.data['label']
        self.texts = self.data['text']
        self.textsL1 = self.data.loc[self.data['label'] == 1]
        self.textsL0 = self.data.loc[self.data['label'] == 0]
        
        # self.vectorizer = TfidfVectorizer()
        # self.word_count_vector = self.cv.fit_transform(self.texts)
    
    ##########
    ## SST2 ##
    ##########
    
    def checkBalance(self):
        len1 = len(self.textsL1)
        len0 = len(self.textsL0)
        print('Label 1:',len1)
        print('Label 0:',len0)
        if len1 > len0:
            print('Their ratio:',abs(len0/len1))
        else:
            print('Their ratio:',abs(len1/len0))
            

    def nGrams(self, minN,maxN):
        for i in range(minN,maxN):
            cv = CountVectorizer(ngram_range=(i,i))
            # word_count_vector = cv.fit_transform(self.texts)
            word_count_vector = cv.fit_transform(self.texts)
            # print(word_count_vector)
            
            tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
            # tfidf_transformer=TfidfTransformer()
            tfidf_transformer.fit(word_count_vector)
            
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(),columns=["idf_weights"])
            
            # sort ascending 
            print(df_idf.sort_values(by = ['idf_weights'])[:5])
    
    def lengthCorrelation(self):
        lengths = [(lambda x: len(x[1]))(x) for x in self.texts.items()]
        lenLab = pd.DataFrame( list(zip(lengths, self.labels)), columns = ["length","labels"])
        print(lenLab.corr())
    
    def tfIdfWithCV(self):
        word_count_vector = CountVectorizer().fit_transform(self.texts)
        print(word_count_vector)
        wc = pd.DataFrame(word_count_vector, index = cv.get_feature_names(), columns = ["idf_weights"])
        # sort ascending 
        # df_idf = pd.DataFrame(word_count_vector.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
    
    def tfIdfWithTranformer(self):
        # this steps generates word counts for the words in your docs 
        cv = CountVectorizer()
        word_count_vector = cv.fit_transform(self.texts)
        # print(word_count_vector)
        
        tfidf_transformer=TfidfTransformer(smooth_idf = True, use_idf = True) 
        tfidf_transformer.fit(word_count_vector)
        
        df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(), columns = ["idf_weights"])
        
        # sort ascending 
        # print(self.tfidf_transformer.idf_)
        print(df_idf.sort_values(by = ['idf_weights']))

    def majorityVote(self):
        estimators = []
        estimator.append(('LR', 
                  LogisticRegression(solver ='lbfgs', 
                                     multi_class ='multinomial', 
                                     max_iter = 200)))
        estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
        estimator.append(('DTC', DecisionTreeClassifier()))

    def collabEval(self):
        # Get data
        df = pd.read_csv("GlobalSheet.csv")
        
        cols = []
        for i in range(29):
            cols.append(str(i+1))

        ratings = df.drop(["Sentence Index","Ground Truth Labels"] + cols,axis=1)
        groundTruth = df["Ground Truth Labels"]
        users = ratings.columns
        
        # Start to calculate


train = SentAnalyzer("stsa.binary.phrases.train")
#instantiate CountVectorizer() 
# train.tfIdfWithTranformer()






# print idf values 

In [141]:
train.checkBalance()

Label 1: 42259
Label 0: 34702
Their ratio: 0.8211741877469888


In [142]:
# Get the n-grams
train.nGrams(1,4)

     idf_weights
the     2.210299
and     2.367068
of      2.490469
to      2.807791
is      3.125087
          idf_weights
of the       4.053253
in the       4.658701
the film     4.880207
to the       5.155174
to be        5.196617
              idf_weights
one of the       6.096209
the film is      6.918348
the kind of      7.009320
the movie is     7.086281
of the year      7.103573


In [143]:
train.lengthCorrelation()

          length    labels
length  1.000000 -0.037769
labels -0.037769  1.000000


In [144]:
import os
import json
import pandas as pd
import gzip
from urllib.request import urlopen

###################
## Amazon Review ##
###################

# Load in the Amazon Review Data with 5-core
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Software_5.json.gz')
df = df.fillna('')
print('shape', df.shape)
i = 0
for (columnName, columnData) in df.iteritems():
    i+=1
    if i > 5:
        break
    print('Colunm Name : ', columnName)
    print('Column Contents : ', columnData.values[:5])

shape (12805, 12)
Colunm Name :  overall
Column Contents :  [4. 4. 5. 5. 5.]
Colunm Name :  verified
Column Contents :  [False False False False False]
Colunm Name :  reviewTime
Column Contents :  ['10 20, 2010' '10 18, 2010' '10 16, 2010' '10 12, 2010' '10 7, 2010']
Colunm Name :  reviewerID
Column Contents :  ['A38NELQT98S4H8' 'A3QJU4FEN8PQSZ' 'ACJT8MUC0LRF0' 'AYUF7YETYOLNX'
 'A31ICLWQ9CSHRS']
Colunm Name :  asin
Column Contents :  ['0321719816' '0321719816' '0321719816' '0321719816' '0321719816']


In [149]:
pip install krippendorff

Collecting krippendorff
  Downloading krippendorff-0.4.0-py3-none-any.whl (17 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [204]:
def nominal_metric(a, b):
    return a != b


def interval_metric(a, b):
    return (a-b)**2


def ratio_metric(a, b):
    return ((a-b)/(a+b))**2

def krippendorff_alpha(data, metric=interval_metric, force_vecmath=False, convert_items=float, missing_items=None):
    '''
    Calculate Krippendorff's alpha (inter-rater reliability):
    
    data is in the format
    [
        {unit1:value, unit2:value, ...},  # coder 1
        {unit1:value, unit3:value, ...},   # coder 2
        ...                            # more coders
    ]
    or 
    it is a sequence of (masked) sequences (list, numpy.array, numpy.ma.array, e.g.) with rows corresponding to coders and columns to items
    
    metric: function calculating the pairwise distance
    force_vecmath: force vector math for custom metrics (numpy required)
    convert_items: function for the type conversion of items (default: float)
    missing_items: indicator for missing items (default: None)
    '''
    
    # number of coders
    m = len(data)
    
    # set of constants identifying missing values
    if missing_items is None:
        maskitems = []
    else:
        maskitems = list(missing_items)
    if np is not None:
        maskitems.append(np.ma.masked_singleton)
    
    # convert input data to a dict of items
    units = {}
    for d in data:
        try:
            # try if d behaves as a dict
            diter = d.items()
        except AttributeError:
            # sequence assumed for d
            diter = enumerate(d)
            
        for it, g in diter:
            if g not in maskitems:
                try:
                    its = units[it]
                except KeyError:
                    its = []
                    units[it] = its
                its.append(convert_items(g))


    units = dict((it, d) for it, d in units.items() if len(d) > 1)  # units with pairable values
    n = sum(len(pv) for pv in units.values())  # number of pairable values
    
    if n == 0:
        raise ValueError("No items to compare.")
    
    np_metric = (np is not None) and ((metric in (interval_metric, nominal_metric, ratio_metric)) or force_vecmath)
    
    Do = 0.
    for grades in units.values():
        if np_metric:
            gr = np.asarray(grades)
            Du = sum(np.sum(metric(gr, gri)) for gri in gr)
        else:
            Du = sum(metric(gi, gj) for gi in grades for gj in grades)
        Do += Du/float(len(grades)-1)
    Do /= float(n)

    if Do == 0:
        return 1.

    De = 0.
    for g1 in units.values():
        if np_metric:
            d1 = np.asarray(g1)
            for g2 in units.values():
                De += sum(np.sum(metric(d1, gj)) for gj in g2)
        else:
            for g2 in units.values():
                De += sum(metric(gi, gj) for gi in g1 for gj in g2)
    De /= float(n*(n-1))

    return 1.-Do/De if (Do and De) else 1.

In [217]:
# Crowdsourcing Exercise
import krippendorff
import pandas as pd
import numpy as np

def nominal_metric(a, b):
    return a != b


def interval_metric(a, b):
    return (a-b)**2

class crowdSourcer():
    
    def __init__(self, sourcePath):
        self.data = pd.read_csv(sourcePath)
        # data = pd.read_csv(sourcePath,header=1)

        # The feature phrases removed
        cols = []
        for i in range(29):
            cols.append("feature"+str(i+1))
        # print(cols)

        
        self.ratings = self.data.drop(["Sentence Index","Ground Truth Labels"] + cols,axis=1)
        self.groundTruth = self.data["Ground Truth Labels"]
        self.users = self.ratings.columns
        # features = data.drop(["Sentence Index","Ground Truth Labels"] + users.values[:], axis = 1)
        # print(self.ratings)

    def calcKrippendorff(self):
        missing = ''        
        # print(ratings)
        kripCoef = krippendorff.alpha(reliability_data=self.ratings)
        # kripNominalCoef = krippendorff.alpha(reliability_data=self.ratings, metric=nominal_metric, missing_items=missing)
        # kripIntervalCoef = krippendorff.alpha(reliability_data=self.ratings, metric=interval_metric, missing_items=missing)
        print("Krippendorff metric: %.3f" % kripCoef)
        

collabData = crowdSourcer("GlobalSheet.csv")
collabData.calcKrippendorff()

Krippendorff metric: 0.005
