In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
# Read the labelled training tweets and preview the first five rows.
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Show the full, untruncated text of the row labelled 0.
df.loc[0, 'text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [4]:
# Undersample so both target classes contribute the same number of rows.

# Split the frame by label.
df_target_0 = df.loc[df['target'] == 0]
df_target_1 = df.loc[df['target'] == 1]

# The smaller class dictates how many rows each class keeps.
n_samples = min(df_target_0.shape[0], df_target_1.shape[0])

# Draw n_samples rows from each class with a fixed seed; for the smaller class
# this keeps every row and only shuffles their order.
df_target_0 = df_target_0.sample(n=n_samples, random_state=0)
df_target_1 = df_target_1.sample(n=n_samples, random_state=0)

# Stack class 0 on top of class 1 and renumber the rows 0..N-1, discarding the
# original index labels.
df = pd.concat([df_target_0, df_target_1]).reset_index(drop=True)

In [5]:
# Display the balanced frame (class-0 rows first, then class-1 rows).
df

Unnamed: 0,id,keyword,location,text,target
0,8520,screaming,All around the world,@ArianaGrande @justinbieber I'M SCREAMING OMG ...,0
1,9000,stretcher,Austin/Los Angeles,Stretcher brought out for Vampiro. Cut to comm...,0
2,9395,survivors,,Dying with debt can be costly for survivors,0
3,1225,blizzard,,What is the biggest regret you have in hearths...,0
4,2496,collided,See the barn of bleakness,OMG OMG OMG #JustinBieber and #HarryStyles hav...,0
...,...,...,...,...,...
6537,3125,debris,New York,Malaysian Officials Say Debris Found on Reunio...,1
6538,10866,,,Suicide bomber kills 15 in Saudi security site...,1
6539,5712,forest%20fire,PDX,BE CAREFUL anyone who lives west of Beaverton....,1
6540,8972,storm,Alberta,'Calgarians stunned by storm insurance compani...,1


In [6]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
# Normalise every training tweet: replace each non-letter character with a
# space, lowercase the result, and collapse whitespace runs to single spaces.
#
# NOTE: tokens are NOT lemmatized. WordNetLemmatizer is imported above but is
# deliberately not instantiated here, because it was never applied to the text;
# if lemmatization is wanted it must be added to the train and test
# preprocessing together so both feature sets stay consistent.
#
# Iterating the column directly (instead of indexing by label 0..n-1) keeps
# this correct regardless of how the frame is indexed.
corpus = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', tweet).lower().split())
    for tweet in df['text']
]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: keep at most 1500 vocabulary terms and drop the
# built-in English stop words, then materialise the counts as a dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split below, so held-out rows influence feature selection.
cv = CountVectorizer(stop_words='english', max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Labels aligned row-for-row with `corpus`.
y = df['target']

In [8]:
# Number of feature columns produced by the vectoriser.
X.shape[1]

1500

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Inspect the training feature matrix (dense term counts from the vectoriser).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Model selection

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Candidate classifiers to compare on the same train/test split.
gaussian = GaussianNB()
svc = SVC()
cart_model = DecisionTreeClassifier(criterion='gini', random_state=0)
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases; confirm against the installed version before upgrading.
maxent_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
    max_iter=1000,
)

# Display name -> estimator, in the order the results are printed.
model_list = {
    'GaussianNB': gaussian,
    'SVC': svc,
    'CART': cart_model,
    'Maximum Entropy': maxent_model,
}

# Fit each candidate on the training split and print its held-out accuracy.
for label, clf in model_list.items():
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(label, accuracy_score(y_test, y_pred))

GaussianNB 0.7188693659281895
SVC 0.7807486631016043
CART 0.7295645530939648
Maximum Entropy 0.7616501145912911


In [12]:
# Train the chosen RBF-kernel SVC with explicit hyper-parameters and print its
# accuracy on the held-out split.
final_model = SVC(C=1, gamma=0.1, kernel='rbf').fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7807486631016043


In [13]:
# Load the unlabelled competition test set.
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Normalise each test tweet with the same steps applied to the training
# corpus: non-letters -> spaces, lowercase, collapse whitespace.
#
# NOTE: tokens are NOT lemmatized. No WordNetLemmatizer is instantiated here
# because it was never applied to the text; adding lemmatization would have to
# be done for the training corpus as well so the features stay consistent.
#
# Iterating the column directly avoids depending on the frame's index labels.
corpus_test = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', tweet).lower().split())
    for tweet in df_test['text']
]

# Reuse the vocabulary already fitted in `cv` (transform only, no re-fit).
X_test_real = cv.transform(corpus_test).toarray()

# Predict labels for the submission with the final model.
new_predict = final_model.predict(X_test_real)

In [14]:
# Pair each test id with its predicted label and write the submission file
# without the index column.
my_submission = pd.DataFrame(
    {
        'id': df_test['id'],
        'target': new_predict.reshape(-1),
    }
)
my_submission.to_csv('submission.csv', index=False)