In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# Location of the labelled training data, relative to the notebook's working directory.
TRAIN_CSV_PATH = "train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Preview the raw frame; it has the columns id, keyword, location, text and target.
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
# Column dtypes and non-null counts: keyword and location have missing values,
# while text and target are complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
# Summary statistics of the numeric columns (id and target). The mean of the
# 0/1 target column is the fraction of rows labelled 1.
df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [7]:
# Remove the columns that are not used as features (only `text` and `keyword`
# feed the model below). Reassigning the result instead of using inplace=True,
# together with errors="ignore", keeps this cell safe to re-run: a second
# in-place drop would raise KeyError because the columns are already gone.
df = df.drop(columns=["location", "id"], errors="ignore")

In [8]:
# Replace missing keywords with the most frequent keyword (the mode).
# Assign the filled column back explicitly: calling fillna(..., inplace=True)
# on the attribute-accessed column acts on an intermediate object, which
# triggers chained-assignment warnings and does not update `df` when pandas
# Copy-on-Write is enabled.
df["keyword"] = df["keyword"].fillna(df["keyword"].mode()[0])

# Normalise the row index to 0..n-1 so positional loops can index by label.
# drop=True discards the old index instead of inserting it as a spurious
# "index" column (and a "level_0" column / ValueError on repeated runs).
df = df.reset_index(drop=True)

In [9]:
# List the distinct keyword values. Multi-word keywords are URL-encoded with
# "%20" in place of spaces (e.g. 'airplane%20accident').
df.keyword.unique()

array(['fatalities', 'ablaze', 'accident', 'aftershock',
       'airplane%20accident', 'ambulance', 'annihilated', 'annihilation',
       'apocalypse', 'armageddon', 'army', 'arson', 'arsonist', 'attack',
       'attacked', 'avalanche', 'battle', 'bioterror', 'bioterrorism',
       'blaze', 'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard',
       'blood', 'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'dera

In [10]:
# Token normaliser used by the cleaning loops below. The name `ps` is a
# leftover from an earlier PorterStemmer experiment (`ps = PorterStemmer()`);
# the object is now a WordNet lemmatizer.
ps = WordNetLemmatizer()

In [11]:
# Will hold one cleaned, space-joined string per tweet, in row order.
tweet_corpus = []

In [12]:
# Clean every tweet: keep letters only, lower-case, drop English stop words,
# lemmatize the remaining tokens and join them back into a single string.
#
# The stop-word list is materialised once as a set. Previously
# stopwords.words("english") was re-evaluated (and linearly scanned) for every
# single token, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words("english"))

# Start from an empty list inside this cell so that re-running it rebuilds the
# corpus instead of appending a second copy of every tweet.
tweet_corpus = []
for text in df['text']:
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    kept = [ps.lemmatize(word) for word in tokens if word not in english_stopwords]
    tweet_corpus.append(" ".join(kept))

In [13]:
# Will hold one cleaned, space-joined string per keyword, in row order.
keyword_corpus = []

In [14]:
# Clean every keyword the same way as the tweets: keep letters only (this also
# turns the "%20" separators into spaces), lower-case, drop English stop words,
# lemmatize the remaining tokens and join them back into a single string.
#
# The stop-word list is materialised once as a set. Previously
# stopwords.words("english") was re-evaluated (and linearly scanned) for every
# single token.
english_stopwords = set(stopwords.words("english"))

# Start from an empty list inside this cell so that re-running it rebuilds the
# corpus instead of appending a second copy of every keyword.
keyword_corpus = []
for raw_keyword in df['keyword']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_keyword).lower().split()
    kept = [ps.lemmatize(word) for word in tokens if word not in english_stopwords]
    keyword_corpus.append(" ".join(kept))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned tweets, capped at the 5000 most frequent
# terms and converted from a sparse matrix to a dense NumPy array.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed later, so the held-out rows influence the
# vocabulary and IDF weights. Confirm whether that is acceptable here.
tf = TfidfVectorizer(max_features=5000)
X_tweet = tf.fit_transform(raw_documents=tweet_corpus).toarray()

In [16]:
# Sanity check: one row per tweet, one column per retained term.
X_tweet.shape

(7613, 5000)

In [17]:
# Separate TF-IDF model for the keyword column (at most 1000 terms), again
# converted to a dense array.
# NOTE(review): this rebinds `tf`, so the vectorizer fitted on the tweets is no
# longer reachable afterwards; keep that in mind if new text must be
# transformed with the tweet vocabulary.
tf = TfidfVectorizer(max_features=1000)
X_keyword = tf.fit_transform(raw_documents=keyword_corpus).toarray()

In [18]:
# Sanity check: the keyword vocabulary is much smaller than the max_features
# cap, so the column count here is the actual number of distinct terms.
X_keyword.shape

(7613, 217)

In [19]:
# Start with an empty container for the combined feature rows.
X = []

In [20]:
# Combine the two feature matrices column-wise: each row becomes the tweet
# features followed by the keyword features. np.hstack does this in a single
# vectorised call instead of concatenating row by row in a Python loop, and it
# assigns X directly, so the cell can be re-run even after X has already been
# turned into an array (an ndarray has no .append).
X = np.hstack((X_tweet, X_keyword))

In [21]:
# Make sure X is a NumPy array (a list of per-row vectors becomes one 2-D matrix).
X = np.array(X)

In [22]:
# Sanity check: the column count should equal tweet features + keyword features.
X.shape

(7613, 5217)

In [23]:
# Labels: the 0/1 `target` column, aligned row-for-row with X.
y = df['target']

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [25]:
# MultinomialNB and SVC are imported alongside the ridge model; only
# RidgeClassifier is fitted in this cell.
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0
)

# NOTE: despite its name (a leftover from a MultinomialNB experiment), `mlnb`
# holds a RidgeClassifier trained on the training split only.
mlnb = RidgeClassifier()
mlnb.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = mlnb.predict(X_test)

accuracy_score(y_test, y_pred)

0.8136482939632546

In [26]:
# Break the held-out predictions down by class. With scikit-learn's convention,
# rows are the true labels and columns the predicted labels.
confusion_matrix(y_test,y_pred)

array([[396,  51],
       [ 91, 224]], dtype=int64)