# NLP
Disaster tweets

### Importation of necessary packages

In [344]:
! pip install utils



In [345]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
import random
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lecture of the dataset and preprocessing

In [346]:
#path to dataset
#CHANGE AS NEEDED
df=pd.read_csv (r'C:\Users\phili\Documents\Machine Learning\Kaggle\NLP\train.csv')

- Size and first lines of the dataframe:

In [347]:
print("Size of the dataset :",df.shape)
df.head()

Size of the dataset : (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


- Delete unnecessary columns

In [348]:
del df["id"]
del df["keyword"]
del df["location"]

### Preprocessing of text

In [349]:
# Lower casing
def lower_casing(text):
    return text.str.lower()

In [350]:
# Punctuation removal
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', string.punctuation))

In [351]:
# Remove stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [352]:
# Stemming
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

In [353]:
# Removing of url and html
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [354]:
# Tokenization
def tokenization(text):
    token=[]
    current=""
    for i in range(len(text)):
        if text[i]==" ":
            token.append(current)
            current=""
        else:
            current=current+text[i]
    token.append(current)
    return token

In [355]:
def textProcessing(col):
    col = lower_casing(col)
    col = col.apply(lambda text: remove_punctuation(text))
    col = col.apply(lambda text: remove_stopwords(text))
    col = col.apply(lambda text: stem_words(text))
    col = col.apply(lambda text: remove_urls(text))
    col = col.apply(lambda text: remove_html(text))
    col = col.apply(lambda text: tokenization(text))
    return col

df["text"]=textProcessing(df["text"])

- Example

In [356]:
rd=random.randint(0,len(df["target"]))
print(df["text"][rd],df["target"][rd])

['perspect', 'grate', 'dead', 'critic', 'write', 'contribut', 'studi', 'httptcofmu0fnumxf', 'httptcoaggryhvxkr'] 1


### Creation of the dictionary

In [357]:
def createDict(df):
    dictfreq={}
    for i in range(len(df)):
        for j in range(len(df["text"][i])):
            if df["text"][i][j] not in dictfreq:
                if df["target"][i]==1:
                    dictfreq[df["text"][i][j]]=[0,1]
                else:
                    dictfreq[df["text"][i][j]]=[1,0]
            else:
                if df["target"][i]==1:
                    dictfreq[df["text"][i][j]][1]+=1
                else:
                    dictfreq[df["text"][i][j]][0]+=1
    return dictfreq

dictfreq=createDict(df)

- Features engineering

In [358]:
def extract_features(tweet, dictfreq):
    x =[0,0]
    for word in tweet:
        if word in dictfreq.keys():
            x[0] += dictfreq[word][1]
            x[1] += dictfreq[word][0]
    return x

newcol1=[]
newcol2=[]

for i in range(len(df["text"])):
    temp= extract_features(df["text"][i], dictfreq)
    newcol1.append(temp[0])
    newcol2.append(temp[1])
    
df["new1"]=newcol1
df["new2"]=newcol2

In [359]:
df.head()

Unnamed: 0,text,target,new1,new2
0,"[deed, reason, earthquak, may, allah, forgiv, us]",1,194,156
1,"[forest, fire, near, la, rong, sask, canada]",1,401,121
2,"[resid, ask, shelter, place, notifi, offic, ev...",1,247,124
3,"[13000, peopl, receiv, wildfir, evacu, order, ...",1,434,145
4,"[got, sent, photo, rubi, alaska, smoke, wildfi...",1,195,215


### Model

In [360]:
del df["text"]

y=df["target"].values
df=df.loc[:, df.columns != "target"]

from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression()
regressor.fit(df, y)

LogisticRegression()

In [361]:
y_pred=regressor.predict(df)
print("Accuracy on the train:", accuracy_score(y_pred,y))

Accuracy on the train: 0.7964008932089847


### Submission part

In [362]:
#path to dataset
#CHANGE AS NEEDED
sub=pd.read_csv (r'C:\Users\phili\Documents\Machine Learning\Kaggle\NLP\test.csv')

- Preprocess of the test dataset

In [363]:
del sub["id"]
del sub["keyword"]
del sub["location"]

sub["text"] = textProcessing(sub["text"])

newcol1=[]
newcol2=[]

for i in range(len(sub["text"])):
    temp= extract_features(sub["text"][i], dictfreq)
    newcol1.append(temp[0])
    newcol2.append(temp[1])
    
sub["new1"]=newcol1
sub["new2"]=newcol2

del sub["text"]

- Prediction of the test set

In [364]:
y_submission_pred=regressor.predict(sub)
y_submission_pred=y_submission_pred.astype(int)

In [365]:
# Export
# CHANGE AS NEEDED
export=pd.read_csv (r'C:\Users\phili\Documents\Machine Learning\Kaggle\NLP\test.csv')
export=pd.DataFrame(export["id"])
export["target"]=y_submission_pred
export.to_csv (r'C:\Users\phili\Documents\Machine Learning\Kaggle\NLP\export_sub.csv', index = False, header=True)