In [138]:
%matplotlib inline

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from nltk.corpus import wordnet as wn

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [117]:
BASE = "./data/"
train_data = pd.read_csv(BASE + "train.csv")
test_data = pd.read_csv(BASE + "test.csv")
sub_data = pd.read_csv(BASE + "sample_submission.csv")

In [118]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [119]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [120]:
sub_data.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [121]:
def jitter(values, sd=0.25):
    return [np.random.normal(v, sd) for v in values]

In [122]:
def clean_text(df, col):
    """A function for keeping only alpha-numeric
    characters and replacing all white space with
    a single space.
    """
    return df[col].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x.lower()))\
                  .apply(lambda x: re.sub('\s+', ' ', x).strip())

In [123]:
def count_pattern(df, col, pattern):
    """Count the occurrences of `pattern`
    in df[col].
    """
    df = df.copy()
    return df[col].str.count(pattern)

In [124]:
def split_on_word(text):
    """Use regular expression tokenizer.
    Keep apostrophes.
    Returns a list of lists, one list for each sentence:
        [[word, word], [word, word, ..., word], ...].
    """
    if type(text) is list:
        return [regexp_tokenize(sentence, pattern="\w+(?:[-']\w+)*") for sentence in text]
    else:
        return regexp_tokenize(text, pattern="\w+(?:[-']\w+)*")

In [125]:
def normalize(tokenized_words):
    """Removes stop words, numbers, short words, and lowercases text.
    Returns a list of lists, one list for each sentence:
        [[word, word], [word, word, ..., word], ...].
    """
    stop_words = stopwords.words('english')
    return [[w.lower() for w in sent
             if (w.lower() not in stop_words)]
            for sent in tokenized_words]

In [126]:
def features(df):
    df = df.copy()
    df['n_questionmarks'] = count_pattern(df, 'text', '\?')
    df['n_periods'] = count_pattern(df, 'text', '\.')
    df['n_apostrophes'] = count_pattern(df, 'text', '\'')
    df['first_word'] = df.clean_text.apply(lambda x: split_on_word(x)[0])
    question_words = ['what', 'how', 'why', 'is']
    for w in question_words:
        col_wc = 'n_' + w
        col_fw = 'fw_' + w
        df[col_wc] = count_pattern(df, 'clean_text', w)
        df[col_fw] = (df.first_word == w) * 1
        
    del df['first_word']
    
    df['n_words'] = df.text.apply(lambda x: len(split_on_word(x)))
    return df

In [127]:
def flatten_words(list1d, get_unique=False):
    qa = [s.split() for s in list1d]
    if get_unique:
        return sorted(list(set([w for sent in qa for w in sent])))
    else:
        return [w for sent in qa for w in sent]

In [128]:
train_data['clean_text'] = clean_text(train_data, 'text')
test_data['clean_text'] = clean_text(test_data, 'text')

In [129]:
all_text = train_data['clean_text'].values.tolist() + test_data['clean_text'].values.tolist()
vocab = flatten_words(all_text, get_unique=True)
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)
training_matrix = tfidf.fit_transform(train_data.clean_text)
test_matrix = tfidf.fit_transform(test_data.clean_text)

In [130]:
training_matrix

<7613x27791 sparse matrix of type '<class 'numpy.float64'>'
	with 73763 stored elements in Compressed Sparse Row format>

In [131]:
train_data = features(train_data)
train_data = pd.concat([train_data, pd.DataFrame(training_matrix.todense())], axis=1)
train_data.head(3)

Unnamed: 0,id,keyword,location,text,target,clean_text,n_questionmarks,n_periods,n_apostrophes,n_what,...,27781,27782,27783,27784,27785,27786,27787,27788,27789,27790
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,0,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
test_data = features(test_data)
test_data = pd.concat([test_data, pd.DataFrame(test_matrix.todense())], axis=1)

In [133]:
X = train_data["clean_text"]
y = train_data["target"]
X_test = test_data["clean_text"]

In [134]:
X_for_tf_idf = pd.concat([X, X_test])
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf.fit(X_for_tf_idf)
X = tfidf.transform(X)
X_test = tfidf.transform(X_test)
del X_for_tf_idf

In [135]:
X_train, X_val, y_train, y_val  = train_test_split(X,y, test_size=0.2, random_state=1868)

In [136]:
parameters = { 
    'gamma': [0.7, 1, 'auto', 'scale']
}
model = GridSearchCV(SVC(kernel='rbf'), parameters, cv=4, n_jobs=-1).fit(X_train, y_train)

In [139]:
y_val_pred = model.predict(X_val)
accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

(0.7852921864740644, 0.6958139534883722)

In [140]:
confusion_matrix(y_val, y_val_pred)

array([[822,  54],
       [273, 374]], dtype=int64)

In [141]:
y_test_pred = model.predict(X_test)

In [143]:
sub_data["target"] = y_test_pred
sub_data.to_csv("submission4.csv",index=False)

In [85]:
svm = LinearSVC(dual=False, max_iter=5000)

In [86]:
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,n_questionmarks,n_periods,n_apostrophes,n_what,...,27781,27782,27783,27784,27785,27786,27787,27788,27789,27790
2177,3120,debris,nbc washington,NBCNightlyNews: Malaysian Officials Say Debris...,1,3518,0,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6707,9607,thunder,gamertag: bexrayandvav,@HaydnExists so glad i saved them all at once ...,1,1943,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6819,9766,trapped,central chazifornia,salute to all the kids still trapped in adult ...,0,4391,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2198,3150,debris,"Bristol, UK",Interesting: MH370: Aircraft debris found on L...,1,2676,0,7,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
554,803,battle,,#Tb #throwback ??\n\n??~ You want a battle? He...,0,4876,7,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
X = train['clean_text'].values
y = train['target'].values
features_dev = dev['clean_text'].values

In [88]:
kf = KFold(n_splits=5)

In [89]:
svm.fit(X, y)

ValueError: Expected 2D array, got 1D array instead:
array=[3518. 1943. 4391. ... 1940. 1020. 1617.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [331]:
test = pd.concat([test, pd.DataFrame(test_matrix.todense())], axis=1)

MemoryError: 

In [284]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(tweets['clean_text'],tweets['target'],test_size=0.2,random_state = 145)

In [285]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [287]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(tweets['clean_text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [288]:
print(Tfidf_vect.vocabulary_)



In [289]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  81.02429415627051


In [290]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  80.3676953381484


In [291]:
test

Unnamed: 0,id,text,target,clean_text,0,1,2,3,4,5,...,21561,21562,21563,21564,21565,21566,21567,21568,21569,21570
0,1,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,"13,000 people receive #wildfires evacuation or...",1,13 000 people receive wildfires evacuation ord...,0.0,0.000000,0.420968,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,8,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy 20 closed in b...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,10,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain causes flash floodin...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,13,I'm on top of the hill and I can see a fire in...,1,i m on top of the hill and i can see a fire in...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,14,There's an emergency evacuation happening now ...,1,there s an emergency evacuation happening now ...,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,15,I'm afraid that the tornado is coming to our a...,1,i m afraid that the tornado is coming to our area,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [292]:
test['target'] = predictions_SVM
sub = test[['id','target']]
sub.to_csv("submission3.csv", index=False)
sub

ValueError: Length of values does not match length of index

Unnamed: 0,id,target
0,1,1
1,4,1
2,5,1
3,6,1
4,7,0
