In [167]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [95]:
os.chdir('30_TF-IDF Features by Disaster/10_2018 Train')

In [151]:
os.chdir('30_TF-IDF Features by Disaster/30_2019 B Test')

In [149]:
os.getcwd()

'C:\\Users\\Vishaal\\Documents\\GitHub\\TREC_Distributed_Machine_Learning\\TREC'

In [144]:
df = pd.read_csv('Attack_tfidf_features_2018_train.csv')

In [152]:
df2 = pd.read_csv('Shooting_tfidf_features_2019_B_test.csv')

In [153]:
df_c = pd.DataFrame()
df_c['Tweet'] = pd.concat([df['Tweet'] , df2['Tweet']])
df_c['Priority'] = pd.concat([df['Priority'] , df2['Priority']])

In [154]:
df_c.head()

Unnamed: 0,Tweet,Priority
0,RT @cheerio15: Restaurant in Paris 10th distri...,Low
1,public transport problems in paris due to five...,Medium
2,RT @AP: BREAKING: French police official confi...,High
3,At least 3 of the dead attackers at Bataclan t...,Medium
4,RT @michaelh992: PT: According to Police Union...,Critical


In [155]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
t = []
for element in df_c['Priority']:
    if element =='Critical':
        t.append(1)
    else:
        t.append(0)
        
t = np.array(t)
df_c['Target'] = t
'''
    Converting to categorical
'''
df_c['Target'] = df_c['Target'].astype('category')
t = df_c['Target']
del df_c['Target']

In [162]:
'''
    Creating a function to input lemmatized text to possibly another function that outputs the tfidf in a csv format.
    We could also simply use the output from this funtion in an tfidf format (no csv) and train a model.
'''
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    token_array = []
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()
    stem = PorterStemmer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(''.join(element))
        
    return (lemmatized_array_join)

In [None]:
'''
    OPTIONAL - We use this to convert the tfidf into CSV format. 
    
    Output is a combined dataframe - use pd.to_csv to specify save location 
    
    NOTE - corpus is tfidf in non-array format
'''
def tfidf_to_CSV(df, corpus):
    tf=TfidfVectorizer()
    text_tf= tf.fit_transform(corpus)
    text_tf_dense = text_tf.todense()
    
    words = tf.get_feature_names()
    M = text_tf.tolil()
    l_features = []
    for i in range(M.shape[0]):
        l_features.append(np.array(M[i].todense())[0])
        
    df_features = pd.DataFrame(l_features)
    
    df_combined = pd.concat([df,df_features], axis=1)
    
    cols = list(df.columns) + words
    
    df_combined_1 = df_combined.rename(columns={x:y for x,y in zip(df_combined.columns,cols)})
    
    return (df_combined_1)

In [182]:
'''
    DTM to get TF-IDF features
'''
tf=TfidfVectorizer()
lemmatized_array_join = preProcess(df_c)
text_tf= tf.fit_transform(lemmatized_array_join)

In [169]:
'''
    Converting TF-IDF to list of lists. Then we play around with the datatypes to get features as a dense
    list of numpy arrays l_features. We also get the actual word names that are used as features.
    
    Use only if you need the csv output. NOT REQUIRED TO TRAIN MODEL
'''
words = tf.get_feature_names()
M = text_tf.tolil()
l_features = []
for i in range(M.shape[0]):
    l_features.append(np.array(M[i].todense())[0])

In [216]:
'''
    Converting DTM to array. REQUIRED TO DIRECTLY TRAIN SVM
'''
text_tf= tf.fit_transform(lemmatized_array_join).toarray()

In [172]:
'''
    * Train SVM
    * ‘optimal’: eta = 1.0 / (alpha * (t + t0)) where t0 is chosen by a heuristic proposed by Leon Bottou.
    * This model is trained directly using the tfidf in the matrix form.
'''
clf = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss = 'squared_loss', alpha = 0.0001, max_iter=10000000, tol=1e-3
                                                   , shuffle = True, learning_rate = 'optimal', penalty='l2' ))
clf.fit(text_tf[2066:,:], t[2066:])

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='squared_loss',
                               max_iter=10000000, n_iter_no_change=5,
                               n_jobs=None, penalty='l2', power_t=0.5,
                               random_state=None, shuffle=True, tol=0.001,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False))],
         verbose=False)

In [173]:
y_pred = clf.predict(text_tf[0:2066,:])

In [177]:
print(metrics.recall_score(t[0:2066], y_pred))

0.1951219512195122


In [175]:
'''
    TN = 1785
    FN = 33
    TP = 8
    FP = 240
'''
metrics.confusion_matrix(t[0:2066], y_pred)

array([[1785,  240],
       [  33,    8]], dtype=int64)

In [176]:
print(metrics.precision_score(t[0:2066], y_pred))

0.03225806451612903


In [212]:
from sklearn.cluster import KMeans

In [250]:
km = KMeans(n_clusters = 2, init = 'random', max_iter = 3000000, tol = 0.000000001)

In [251]:
km.fit(text_tf[2066:,:])

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=3000000,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=1e-09, verbose=0)

In [252]:
km_pred = km.predict(text_tf[0:2066,:])
np.unique(km_pred)

array([0])

In [253]:
print(metrics.recall_score(np.array(t[0:2066]), km_pred))

0.0


In [254]:
print(metrics.accuracy_score(np.array(t[0:2066]), km_pred))

0.9801548886737658


In [255]:
print(metrics.precision_score(np.array(t[0:2066]), km_pred))

0.0


  'precision', 'predicted', average, warn_for)


In [257]:
metrics.confusion_matrix(np.array(t[0:2066]), km_pred)

array([[2025,    0],
       [  41,    0]], dtype=int64)