<a href="https://colab.research.google.com/github/RubensBritto/AlgoritmoGenetico/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# importing models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

# importing metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Download Dataset

In [None]:
url_train = 'https://raw.githubusercontent.com/joaovictorferro/Natural-Language-Processing-with-Disaster-Tweets/main/train.csv'
url_test = 'https://raw.githubusercontent.com/joaovictorferro/Natural-Language-Processing-with-Disaster-Tweets/main/test.csv'
url_sublime = 'https://raw.githubusercontent.com/joaovictorferro/Natural-Language-Processing-with-Disaster-Tweets/main/sample_submission.csv'
data = pd.read_csv (url_train)
data_test = pd.read_csv(url_test)
sublime = pd.read_csv(url_sublime)
print(data)

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  
7610  M1.94 [01:04 UT

# Clean Dataset

## Remove Collum Location

In [None]:
data = data.drop(['location','keyword'],axis=1)
data_test = data_test.drop(['location','keyword'],axis=1)
print(data)

         id                                               text  target
0         1  Our Deeds are the Reason of this #earthquake M...       1
1         4             Forest fire near La Ronge Sask. Canada       1
2         5  All residents asked to 'shelter in place' are ...       1
3         6  13,000 people receive #wildfires evacuation or...       1
4         7  Just got sent this photo from Ruby #Alaska as ...       1
...     ...                                                ...     ...
7608  10869  Two giant cranes holding a bridge collapse int...       1
7609  10870  @aria_ahrary @TheTawniest The out of control w...       1
7610  10871  M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...       1
7611  10872  Police investigating after an e-bike collided ...       1
7612  10873  The Latest: More Homes Razed by Northern Calif...       1

[7613 rows x 3 columns]


## Remove rows with NAN

In [None]:
data = data.dropna(subset = ['text'])
data_test = data_test.dropna(subset = ['text'])
print(data)

         id                                               text  target
0         1  Our Deeds are the Reason of this #earthquake M...       1
1         4             Forest fire near La Ronge Sask. Canada       1
2         5  All residents asked to 'shelter in place' are ...       1
3         6  13,000 people receive #wildfires evacuation or...       1
4         7  Just got sent this photo from Ruby #Alaska as ...       1
...     ...                                                ...     ...
7608  10869  Two giant cranes holding a bridge collapse int...       1
7609  10870  @aria_ahrary @TheTawniest The out of control w...       1
7610  10871  M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...       1
7611  10872  Police investigating after an e-bike collided ...       1
7612  10873  The Latest: More Homes Razed by Northern Calif...       1

[7613 rows x 3 columns]


## Remove character special and URLs

In [None]:
def modific(text):
  new_string =  re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text, flags=re.MULTILINE) #remove URL
  new_string = re.sub(r'@|#[a-zA-Z0-9]*',' ', new_string) #remove the word if start @
  new_string = re.sub(r'[0-9]+', ' ', new_string)
  new_string = new_string.lower()
  new_string  = re.sub(r"[^a-zA-Z0-9]"," ",new_string) #remove character special
  return new_string

In [None]:
for i,row in data.iterrows():
  data['text'][i] =  modific(str(data['text'][i]))

for i, row in data_test.iterrows():
  data_test['text'][i] =  modific(str(data_test['text'][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## Stemming

In [None]:
from posixpath import join
def stemming(text):
  new_string = word_tokenize(text)
  stemminList = []
  for word in new_string: 
    stemminList.append(stemmer.stem(word))
    stemminList.append(' ')
  return ''.join(stemminList)

In [None]:
stemmer = LancasterStemmer()
for i,row in data.iterrows():
  data['text'][i] =  stemming(str(data['text'][i]))

for i,row in data_test.iterrows():
  data_test['text'][i] =  stemming(str(data_test['text'][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Countervectorizer

In [None]:
# X = data['text']
# Y = data['target']

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.3, random_state= 42)

In [None]:
cv = CountVectorizer(stop_words='english')
Data_train = cv.fit_transform(data['text']).toarray()
Data_test = cv.transform(data_test['text']).toarray()


In [None]:
y_train = np.array(data['target'])
y_test = np.array(sublime['target'])

## Regression Logistic with Counter Vectorizer

In [None]:
models = [LogisticRegression(), RandomForestClassifier(), SVC(), KNeighborsClassifier(), 
          MultinomialNB(), BernoulliNB()]
model_names = ['Logistic Regression', 'Random Forest', 'SVM', 'KNN', 'Multinomial NB', 'Bernoulli NB']

In [None]:
for model, model_name in zip(models, model_names):
    model_instance = model
    model_instance.fit(Data_train, y_train) # Fitting models 
    print(f'For {model_name}:')
    pred = model_instance.predict(Data_test) # Predictions on validation set
    print(f'Accuracy: {np.round(accuracy_score(y_test, pred) * 100, 2)}%')

For Logistic Regression:
Accuracy: 63.32%
For Random Forest:
Accuracy: 62.64%
For SVM:
Accuracy: 69.08%
For KNN:
Accuracy: 36.96%
For Multinomial NB:
Accuracy: 61.26%
For Bernoulli NB:
Accuracy: 65.31%


# TF-IDF

In [None]:
# TF-IDF in data_train
cv_tfidf = TfidfVectorizer(stop_words="english")
Data_X_train_tfidf = cv_tfidf.fit_transform(data['text']).toarray()
Data_X_test_tfidf = cv_tfidf.transform(data_test['text']).toarray()

In [None]:
for model, model_name in zip(models, model_names):
    model_instance = model
    model_instance.fit(Data_X_train_tfidf, y_train) # Fitting models 
    print(f'For {model_name}:')
    pred = model_instance.predict(Data_X_test_tfidf) # Predictions on validation set
    print(f'Accuracy: {np.round(accuracy_score(y_test, pred) * 100, 2)}%')

For Logistic Regression:
Accuracy: 65.92%
For Random Forest:
Accuracy: 61.69%
For SVM:
Accuracy: 67.97%
For KNN:
Accuracy: 8.73%
For Multinomial NB:
Accuracy: 67.21%
For Bernoulli NB:
Accuracy: 65.31%


# Submission

In [None]:
svm = SVC()
svm.fit(Data_X_train_tfidf,y_train)
predictions = svm.predict(Data_X_test_tfidf) # using trained SVM model to predict outcomes for test dataset

In [None]:
submission = data_test[['id']]
submission

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
...,...
3258,10861
3259,10865
3260,10868
3261,10874


In [None]:
submission['target'] = predictions

In [None]:
submission.to_csv("submission.csv",index=None)