In [2]:
from helpers.ExtractFeatures import ExtractFeatures,ReadFeatureFiles
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score,recall_score,f1_score,roc_curve,roc_auc_score

In [2]:
from nltk.tokenize import sent_tokenize
with open('/Users/deepak/Desktop/queries.txt','r') as file:
    queries = file.readlines()
queries=[text.strip() for text in queries]
indexes = []
queries_formatted = []
for i in range(len(queries)):
    sents = sent_tokenize(queries[i].strip())
    for j in sents:
        indexes.append(i)
        queries_formatted.append(j)

In [3]:
with open('random_for_survey.pickle','rb') as handle:
    queries_formatted = pickle.load(handle)

In [4]:
queries_formatted

['the purpose of such an integrated dual circuit evaporator being to improve part load performance of a refrigerating or air conditioning system when one circuit of the system is inactive .',
 'the method first transmits a response request message from the pcf to identify the status of two or more pdsns in the system .',
 'the actuator rotationally couples another of the rotational members of the second planetary gear set to the differential .',
 'the textile fabrics made from the present yarn have a smooth surface .',
 'smooth surface enabling the assembly to be picked up and placed using automatic equipment with a vision system .',
 'according to one embodiment .',
 'plastic or other suitable resin to form a panel or similar structure .',
 'various forms of vertical tensioning support can be provided to include soil nails .',
 'the mounting nut includes a first segment and a second segment attached by a hinge portion .',
 'and they are driven by each of separate hydraulic sources .',

In [5]:
class SpecificityModel:
    def __init__(self):
        self.df_pdtb = pd.read_csv('pdtb2.csv',low_memory=False)
        self.df_patent = pd.read_csv('bigPatentData_csv/train.csv')
    
    def extract_features(self):
        fe = ExtractFeatures(self.df_pdtb,self.df_patent)
        #fe.extract_features('i')
        #fe.extract_features('s')
        fe.extract_features('b')
    
    def read_features(self):
        self.obj_read_feats = ReadFeatureFiles()
        self.obj_read_feats.read_features()
        
    def train(self,sent_type='i',split_size=0.1):
        if sent_type == 'i':
            df_wo_labels = self.obj_read_feats.df_i[self.obj_read_feats.df_i.columns.drop('labels')]
            self.in_feats = np.concatenate((df_wo_labels.values,self.obj_read_feats.wf_i),axis = 1)
            self.y_true = self.obj_read_feats.df_i.labels.values
            str_folder_name = 'instantiation'
        elif sent_type == 's':
            df_wo_labels = self.obj_read_feats.df_s[self.obj_read_feats.df_s.columns.drop('labels')]
            self.in_feats = np.concatenate((df_wo_labels.values,self.obj_read_feats.wf_s),axis = 1)
            self.y_true = self.obj_read_feats.df_s.labels.values
            str_folder_name = 'specification'
        elif sent_type == 'b':
            df_wo_labels = self.obj_read_feats.df_b[self.obj_read_feats.df_b.columns.drop('labels')]
            self.in_feats = np.concatenate((df_wo_labels.values,self.obj_read_feats.wf_b),axis = 1)
            self.y_true = self.obj_read_feats.df_b.labels.values
            str_folder_name = 'combined'
            
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.in_feats, self.y_true, test_size=split_size, random_state=5,shuffle=True)
        self.clf = LogisticRegression(solver='liblinear')
        self.clf.fit(np.asarray(self.X_train), np.asarray(self.y_train))
        
        
        with open('models/'+str_folder_name+'/'+'specificity_model.pickle', 'wb') as handle:
            pickle.dump(self.clf, handle, protocol=pickle.HIGHEST_PROTOCOL) 
        
        
    def predict(self):
        self.y_pred = self.clf.predict_proba(np.asarray(self.X_test))
        y_pred_int = np.argmax(1*(self.y_pred > 0.6),axis=1)
        return y_pred_int
    
    def predict_external_set(self,sents):
        df_sents = pd.DataFrame({'text':sents,'labels':np.ones(len(sents),dtype=int)})
        fe = ExtractFeatures(df_pdtb = df_sents, df_patent=self.df_patent,state='test')
        fe.extract_test_features()
        
        df_t,wf_t = self.obj_read_feats.read_files(sent_type='t')
        df_wo_labels = df_t[df_t.columns.drop('labels')]
        in_feats = np.concatenate((df_wo_labels.values,wf_t),axis = 1)
        
        y_pred = self.clf.predict_proba(np.asarray(in_feats))
        y_pred_int = np.argmax(1*(y_pred > 0.6),axis=1)
        return y_pred_int
        
    def cross_val_scores(self,folds=10):
        clf = LogisticRegression(solver='liblinear')
        cv = KFold(n_splits=folds, random_state=1, shuffle=True)
        scores = cross_val_score(clf, self.in_feats, self.y_true, scoring='accuracy', cv=cv, n_jobs=-1)
        return scores
    

In [6]:
a = SpecificityModel()

In [7]:
#a.extract_features()
a.read_features()

Instantiation Features found!
features/instantiation/necd_features.pickle
features/instantiation/polarity_features.pickle
features/instantiation/sentence_length_features.pickle
features/instantiation/specificity_features.pickle
features/instantiation/syntactic_features.pickle
features/instantiation/lm_features.pickle
Specification Features found!
features/specification/necd_features.pickle
features/specification/polarity_features.pickle
features/specification/sentence_length_features.pickle
features/specification/specificity_features.pickle
features/specification/syntactic_features.pickle
features/specification/lm_features.pickle
Combined Features found!
features/combined/necd_features.pickle
features/combined/polarity_features.pickle
features/combined/sentence_length_features.pickle
features/combined/specificity_features.pickle
features/combined/syntactic_features.pickle
features/combined/lm_features.pickle
Combined Features found!
features/test/necd_features.pickle
features/test/pola

In [8]:
a.train('i')

In [9]:
ans = a.predict_external_set(queries_formatted)

Dictionary not found! Creating one...


Counting number of documents with the word.: 100%|████████████████████ [324/324]
Collecting Polarity features.: 100%|████████████████████████████████████ [40/40]
Collecting Sentence Length features.: 100%|█████████████████████████████ [40/40]
Collecting Specificity features.: 100%|█████████████████████████████████ [40/40]
Collecting NE+CD features.: 100%|███████████████████████████████████████ [40/40]
Collection syntactic features.: 100%|███████████████████████████████████ [40/40]
Collection Language Model features.: 100%|██████████████████████████████ [40/40]


Collecting word features.
features/test/necd_features.pickle
features/test/polarity_features.pickle
features/test/sentence_length_features.pickle
features/test/specificity_features.pickle
features/test/syntactic_features.pickle
features/test/lm_features.pickle


In [11]:
df_survey = pd.DataFrame({'sents':queries_formatted,'preds':ans})

In [12]:
df_survey

Unnamed: 0,sents,preds
0,the purpose of such an integrated dual circuit...,1
1,the method first transmits a response request ...,1
2,the actuator rotationally couples another of t...,0
3,the textile fabrics made from the present yarn...,0
4,smooth surface enabling the assembly to be pic...,1
5,according to one embodiment .,0
6,plastic or other suitable resin to form a pane...,0
7,various forms of vertical tensioning support c...,1
8,the mounting nut includes a first segment and ...,0
9,and they are driven by each of separate hydrau...,0


In [47]:
gen_plus_sp = []
only_gen = []
for i in range(91):
    np_ind = df_survey.loc[df_survey['index']==i].values
    if 1 in np_ind[:,2]:
        indexes = np.where(np_ind[:,2] == 0)[0]
        gen_sents = np_ind[:,1][indexes]
        if len(gen_sents) > 0:
            only_gen.append(' '.join(gen_sents))
            gen_plus_sp.append(' '.join(np_ind[:,1]))

In [48]:
with open('only_general.pickle', 'wb') as handle:
    pickle.dump(only_gen, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('gen_and_sp.pickle', 'wb') as handle:
    pickle.dump(gen_plus_sp, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
np.where(df_survey.loc[df_survey['index']==1].values[:,2] == 0)[0]

array([0, 1])

In [19]:
df_survey.loc[df_survey['index']==np.array(set(df_survey.loc[(df_survey['preds']==1)]['index'].values))]

Unnamed: 0,index,sents,preds


In [41]:
indexes = []
sent_counts = []
sp_counts=[]
prods = []
for i in range(91):
    df_article = df_survey.loc[df_survey['index']==i]
    sent_count = df_article.shape[0]
    specific_count = df_survey.loc[(df_survey['index']==i) & (df_survey['preds']==1)].shape[0]
    product = specific_count/sent_count
    prods.append(product)
    sent_counts.append(sent_count)
    sp_counts.append(specific_count)
    indexes.append(i)

In [42]:
df_results=pd.DataFrame({'index':indexes,'sent_counts':sent_counts,'sp_counts':sp_counts,'ratio':prods})

In [43]:
sp_counts/sent_counts

TypeError: unsupported operand type(s) for /: 'list' and 'list'

In [45]:
df_results.to_csv('/Users/deepak/Desktop/sp_results.csv')

In [None]:
preds=a.predict()

In [None]:
len(preds)

In [None]:
dec_f = a.clf.score(a.X_test,a.y_test)

In [None]:
a.cross_val_scores()

In [None]:
in_feats = a.obj_read_feats.df_i.sample(50)

In [None]:
feat_key = 'mpqa_norm_score'
feat_selected = in_feats[[feat_key,'labels']]

In [None]:
a = feat_selected.sort_values(by=[feat_key], ascending=True)
a

In [None]:
len(feat_selected.loc[(feat_selected[feat_key] > 10) & (feat_selected.labels==1)])

In [None]:
feats_n = in_feats

In [None]:
X_train, X_test, y_train, y_test = train_test_split(feats_n, a.y_true, test_size=0.2, random_state=15,shuffle=True)
clf = LogisticRegression(solver='liblinear')
clf.fit(np.asarray(X_train), np.asarray(y_train))

In [None]:
y_pred = clf.predict_proba(np.asarray(X_test))
y_pred_int = np.argmax(1*(y_pred > 0.5),axis=1)

In [None]:
print(accuracy_score(a.y_test,preds))

In [None]:
print(classification_report(a.y_test,preds))

In [None]:
print(confusion_matrix(a.y_test,preds))

In [None]:
print(f1_score(a.y_test,preds))

In [None]:
a.y_pred

In [None]:
ns_probs = [0 for _ in range(len(a.y_test))]

fpr0, tpr0, thresholds0 = roc_curve(a.y_test,ns_probs)
fpr, tpr, thresholds = roc_curve(a.y_test,a.y_pred[:,1])

In [None]:
plt.figure(figsize=(12,10))
plt.plot(fpr0,tpr0, linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic',color='red')
plt.xlabel('False Positive Rate (FPR)', fontsize=18)
plt.ylabel('True Positive Rate (TPR)', fontsize=18)
#plt.show()
plt.xticks(fontsize=16, rotation=0)
plt.yticks(fontsize=16, rotation=0)
plt.savefig('/Users/deepak/Desktop/roc.png')

In [None]:
auc = roc_auc_score(a.y_test,a.y_pred[:,1])

In [None]:
auc

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

In [None]:
optimal_threshold