In [1]:
# importing the Dataset

import pandas as pd
import re
import string

In [2]:
Train = pd.read_csv('train.csv')
Test_Data = pd.read_csv('test.csv')
Test = Test_Data.copy()

In [3]:
Train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
Test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
Test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [7]:
Train.fillna('other',inplace=True)

In [8]:
Test.fillna('other',inplace=True)

In [9]:
# First lets remove Punctuations from the Messages
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str
Train['text'] = Train['text'].apply(punctuation_removal)

In [10]:
# another way
#def prepare_the_data(text):
    #text = text.lower()
    #text = re.sub(r'http\S+', '', text) #remove url
    #text = re.sub(r'[^\w\s]', '', text) #remove punctuations
    #text = re.sub(r"0\S+" , '',  text) #remove words that start with
    #text = re.sub(r'\s+', ' ',   text) #remove duplicate whitespaces
    #return text
#Train['clean_text'] = Train['text'].apply(lambda x: prepare_the_data(x))

In [11]:
#%pip install nltk

In [12]:
#import nltk

In [13]:
#nltk.download('all')

In [14]:
# Now lets Remove the Stopwords also

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')
stop.append("i'm")

stop_words = []

for item in stop: 
    new_item = punctuation_removal(item)
    stop_words.append(new_item) 

def stopwords_removal(string):
    string = word_tokenize(string)
    return [word.lower() for word in string 
            if word.lower() not in stop_words ]

Train['text'] = Train['text'].apply(stopwords_removal)

In [15]:
# lets remove the Numbers also

import re
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)

Train['text'] = Train['text'].apply(drop_numbers)

In [16]:
# same on Test Data
Test['text'] = Test['text'].apply(punctuation_removal)
Test['text'] = Test['text'].apply(stopwords_removal)
Test['text'] = Test['text'].apply(drop_numbers)

In [17]:
Train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,other,other,deeds reason earthquake may allah forgive us,1
1,4,other,other,forest fire near la ronge sask canada,1
2,5,other,other,residents asked shelter place notified officer...,1
3,6,other,other,people receive wildfires evacuation orders cal...,1
4,7,other,other,got sent photo ruby alaska smoke wildfires pou...,1


In [18]:
Test.head()

Unnamed: 0,id,keyword,location,text
0,0,other,other,happened terrible car crash
1,2,other,other,heard earthquake different cities stay safe ev...
2,3,other,other,forest fire spot pond geese fleeing across str...
3,9,other,other,apocalypse lighting spokane wildfires
4,11,other,other,typhoon soudelor kills china taiwan


In [19]:
Train_Data = Train.drop(columns=['target'])

In [20]:
y = Train['target']

In [21]:
pdList = [Train_Data, Test]  # List of your dataframes
new_df = pd.concat(pdList)

In [22]:
new_df

Unnamed: 0,id,keyword,location,text
0,1,other,other,deeds reason earthquake may allah forgive us
1,4,other,other,forest fire near la ronge sask canada
2,5,other,other,residents asked shelter place notified officer...
3,6,other,other,people receive wildfires evacuation orders cal...
4,7,other,other,got sent photo ruby alaska smoke wildfires pou...
...,...,...,...,...
3258,10861,other,other,earthquake safety los angeles ûò safety faste...
3259,10865,other,other,storm ri worse last hurricane hardest hit yard...
3260,10868,other,other,green line derailment chicago httptcoutbxlcbiuy
3261,10874,other,other,meg issues hazardous weather outlook hwo


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True)
New_Data = vectorizer.fit_transform(new_df['text']).toarray()

In [24]:
New_Data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
New_Data_df = pd.DataFrame(New_Data)

In [26]:
Train_x = New_Data_df.iloc[:7613,:21543]
Test_x = New_Data_df.iloc[7613:10876,:21543]

In [27]:
Train_x.shape

(7613, 21543)

In [28]:
Test_x.shape

(3263, 21543)

In [29]:
y.shape

(7613,)

In [30]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(Train_x, y)

In [31]:
Model_pred = spam_detect_model.predict(Test_x)

In [32]:
Model_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [33]:
Model_pred.shape

(3263,)

In [35]:
final_test = pd.DataFrame(data = {'id' : Test_Data.id.values, 'target': Model_pred})
final_test.to_csv('submission.csv', index = False)

In [121]:
submission.to_csv('submission.csv', index=False)

NameError: name 'submission' is not defined

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True)
train_x = vectorizer.fit_transform(Train['text']).toarray()
test = vectorizer.fit_transform(Test['text']).toarray()

In [65]:
train_x.shape

(7613, 17046)

In [67]:
y.shape

(7613,)

In [66]:
test.shape

(3263, 10063)

In [62]:
y = Train['target']

In [37]:
# Train Test Split

#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(train_x, y, test_size = 0.3, random_state = 0)

In [64]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(train_x, y)


In [68]:
spam_detect_model.predict(test)

ValueError: X has 10063 features, but MultinomialNB is expecting 17046 features as input.

In [40]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test,y_pred)
score

0.792031523642732

In [44]:
# predicting results for Test Data
y_predict =spam_detect_model.predict(test)

ValueError: X has 10063 features, but MultinomialNB is expecting 17046 features as input.