<a href="https://www.kaggle.com/code/sachinpatil1280/tweet-classification-machine-learning?scriptVersionId=144397129" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [None]:
# Basic
import pandas as pd 
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
from wordcloud import WordCloud

# Feature Engineering
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay, accuracy_score


# Import train & test data

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Both frames bundled in one list; `combine` is not read anywhere in the visible notebook.
combine = [train,test]

# Basic Analysis

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Count missing values per column of the training frame.
train.isnull().sum()

# Visualisation

In [None]:
# Bar chart of how many training rows fall into each `target` value.
plt.figure(figsize=(14,6))
# The seaborn style is set globally here, so it also applies to later plots.
sns.set_style('darkgrid')
sns.countplot(data=train, x='target',palette='pastel')
plt.title('CountPlot for Target')
plt.tight_layout()
plt.show()


In [None]:
# Character count of every raw tweet, stored as a new `length` column.
train['length'] = train['text'].map(len)

In [None]:
# Histogram (with KDE overlay) of tweet character length, coloured by target class.
plt.figure(figsize=(20,8))
sns.histplot(data=train,x='length',hue='target',palette='hsv',kde= True,bins=70)
plt.title('Distrubution of text length')
# tight_layout must run before show(): once the figure has been shown,
# a later call no longer affects the rendered output.
plt.tight_layout()
plt.show()

In [None]:
def avgwordlen(strlist):
    """Return the length of every item in ``strlist`` as a list.

    Despite its name, this helper does not average anything: it returns the
    individual lengths, and the caller takes the mean of the result.

    Args:
        strlist: Iterable of sized items (here, the words of one tweet).

    Returns:
        A list with ``len(item)`` for each item, in input order.
    """
    # A comprehension avoids the local variable that shadowed the builtin ``sum``.
    return [len(word) for word in strlist]

# Mean word length of each tweet, computed separately for the two target classes.
avgword_len_dis = (
    train.loc[train['target'] == 1, 'text']
    .str.split()
    .apply(avgwordlen)
    .map(np.mean)
)
avgword_len_non_dis = (
    train.loc[train['target'] == 0, 'text']
    .str.split()
    .apply(avgwordlen)
    .map(np.mean)
)

group_labels = ['Disaster', 'Non-Disaster']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

# Overlay both distributions in a single plotly distplot.
fig = ff.create_distplot(
    [avgword_len_dis, avgword_len_non_dis],
    group_labels,
    bin_size=.2,
    colors=colors,
)

fig.update_layout(
    title_text="Average word length in tweets",
    title_x=0.5,
    xaxis_title="Text",
    yaxis_title="Density",
)
fig.show()

In [None]:
# Number of training rows per keyword, most frequent first.
keyword = train.groupby('keyword')['target'].count()
df_key = (
    keyword.rename_axis('keywords')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the 30 most frequent keywords.
plt.figure(figsize=(14, 5))
sns.barplot(data=df_key.head(30), x='keywords', y='count')
plt.xticks(rotation=50)
plt.title('Top 30 keywords on Tweets')
plt.tight_layout()
plt.show()

In [None]:
# Number of training rows per location, most frequent first.
location = train.groupby('location')['target'].count()
df_loc = (
    location.rename_axis('location')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the 30 most frequent locations.
plt.figure(figsize=(14, 5))
sns.barplot(data=df_loc.head(30), x='location', y='count')
plt.xticks(rotation=50)
plt.title('Top 30 locations of Tweets')
plt.tight_layout()
plt.show()

# Feature Engineering

In [None]:
from functools import lru_cache

stemmer = PorterStemmer()

# Patterns are compiled once when the cell runs instead of on every tweet.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_MENTION_RE = re.compile(r"@\S+")
_HTML_TAG_RE = re.compile(r"<.*?>")
_DIGIT_RE = re.compile(r"[0-9]")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")

# Contraction expansions, applied in this order to lower-cased text.
# Whole words come before the generic suffix rules. Compound forms such as
# "won't've" need no entry of their own: "won't" and "'ve" expand in turn.
_CONTRACTIONS = (
    ("won't", " will not"),
    ("can't", " can not"),
    ("don't", " do not"),
    ("ma'am", " madam"),
    ("let's", " let us"),
    ("ain't", " am not"),
    ("shan't", " shall not"),
    ("sha'n't", " shall not"),
    ("o'clock", " of the clock"),
    ("y'all", " you all"),
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " is"),
    ("'d", " would"),
    ("'ll", " will"),
    ("'t", " not"),
    ("'ve", " have"),
    ("'m", " am"),
)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_data(data):
    """Clean one tweet and return its stemmed tokens joined by single spaces.

    Steps, in order:
      1. Normalise the typographic apostrophe and lower-case the text, so the
         contraction rules also match "Won't" or "DON'T".
      2. Replace URLs and @mentions with a space. This has to happen before
         non-letters are stripped, otherwise the "@" marker is already gone.
      3. Expand contractions.
      4. Replace HTML tags with a space and delete digits.
      5. Replace every remaining non-letter (punctuation, emoji, symbols)
         with a space.
      6. Drop English stop words, stem the remaining words with the Porter
         stemmer, and drop stems of two characters or fewer.

    Args:
        data: The raw tweet text (``str``).

    Returns:
        The cleaned text as a single space-separated string; may be empty.
    """
    text = data.replace("\u2019", "'").lower()

    text = _URL_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)

    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    text = _HTML_TAG_RE.sub(' ', text)

    # Digits are deleted rather than replaced by a space.
    text = _DIGIT_RE.sub('', text)

    # Everything that is not an ASCII letter becomes a space; this also covers
    # punctuation, brackets and emoji, so no separate passes are needed.
    text = _NON_LETTER_RE.sub(' ', text)

    # Stop words are checked before stemming, against a set built only once.
    stop_words = _english_stop_words()
    stems = [stemmer.stem(word) for word in text.split() if word not in stop_words]

    # Keep only stems longer than two characters.
    return ' '.join(stem for stem in stems if len(stem) > 2)

In [None]:
# Replace the raw tweet text with its cleaned form in both frames.
train['text'] = train['text'].map(preprocess_data)
test['text'] = test['text'].map(preprocess_data)

In [None]:
# One long string per class, used as input for the word clouds.
# Both joins need a space separator: with '' the last word of one tweet
# would fuse with the first word of the next.
disaster = ' '.join(train[train['target']==1]['text'])
non_disaster = ' '.join(train[train['target']==0]['text'])

In [None]:
# Word cloud built from the cleaned text of the target == 1 tweets.
plt.figure(figsize=(14,6))
wordcloud = WordCloud(width=1000,height=500).generate(disaster)
# NOTE(review): cmap presumably has no effect if WordCloud renders an RGB image - confirm.
plt.imshow(wordcloud,interpolation='bilinear',cmap='magma')
plt.axis('off')
plt.tight_layout()
plt.title('Disaster Wordcloud',fontsize= 25,color='Red')
plt.show()

In [None]:
# Word cloud built from the cleaned text of the target == 0 tweets.
plt.figure(figsize=(14,6),frameon=True)
wordcloud = WordCloud(width=800,height=400).generate(non_disaster)
# NOTE(review): cmap presumably has no effect if WordCloud renders an RGB image - confirm.
plt.imshow(wordcloud,interpolation='bilinear',cmap='Dark2')
plt.axis('off')
plt.tight_layout()
plt.title('Non-Disaster Wordcloud',fontsize= 25,color='Black')
plt.show()

# Train Test Split

In [None]:
# Stack train rows on top of test rows so a single vectorizer can be fitted on all text.
data = pd.concat([train,test])
# X holds the cleaned text of train followed by test; y holds the labels of the train rows only.
X = data['text']
y = train['target']

In [None]:
# Row and column counts of both frames.
train.shape,test.shape

In [None]:
# Fit a TF-IDF vocabulary on the combined train + test text and transform it.
# NOTE(review): fitting on the test text too means the vocabulary/IDF weights see test data.
tf = TfidfVectorizer()
data_vec=tf.fit_transform(X)
# Print the resulting matrix for a quick visual check.
print(data_vec)

In [None]:
# Split the stacked TF-IDF matrix back into its train and test rows.
# The boundary comes from the train frame itself rather than a hard-coded
# row count, so it stays correct if the input data changes.
df_train = data_vec[:train.shape[0], :]
df_test = data_vec[train.shape[0]:, :]

In [None]:
# Shapes of the train and test TF-IDF matrices.
df_train.shape,df_test.shape

In [None]:
# Hold out 33% of the labelled rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df_train , y, test_size=0.33, random_state=1)

# Models

In [None]:
# One unfitted estimator per model compared below.
lr = LogisticRegression()
# With kernel='linear', scikit-learn ignores `degree` and `gamma`.
svc = SVC(kernel='linear', degree=3, gamma='auto')
rf = RandomForestClassifier(criterion='gini', max_depth=10, min_samples_split=2, n_estimators=50, random_state=42)
mnb = MultinomialNB()
grd = GradientBoostingClassifier()
# verbose=False silences CatBoost's training log.
cat = CatBoostClassifier(verbose=False)
xgb = XGBClassifier()

# Logistic Regression

In [None]:
# Train logistic regression on the training split and evaluate it on the hold-out split.
sns.set_style('white')
pipe_lr = make_pipeline(lr).fit(X_train, y_train)
pred = pipe_lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_lr = accuracy_score(y_true=y_test, y_pred=pred)

# SVC

In [None]:
# Train the support vector classifier and evaluate it on the hold-out split.
pipe_svc = make_pipeline(svc).fit(X_train, y_train)
pred = pipe_svc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_svc = accuracy_score(y_true=y_test, y_pred=pred)

# Random Forest Classifier

In [None]:
# Train the random forest and evaluate it on the hold-out split.
pipe_rf = make_pipeline(rf).fit(X_train, y_train)
pred = pipe_rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_rf = accuracy_score(y_true=y_test, y_pred=pred)

# Multinomial NB

In [None]:
# Train multinomial naive Bayes and evaluate it on the hold-out split.
pipe_mnb = make_pipeline(mnb).fit(X_train, y_train)
pred = pipe_mnb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_mnb = accuracy_score(y_true=y_test, y_pred=pred)

# Gradient Boosting Classifier

In [None]:
# Train gradient boosting and evaluate it on the hold-out split.
pipe_grd = make_pipeline(grd).fit(X_train, y_train)
pred = pipe_grd.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_grd = accuracy_score(y_true=y_test, y_pred=pred)

# Cat Boost Classifier

In [None]:
# Train CatBoost and evaluate it on the hold-out split.
pipe_cat = make_pipeline(cat).fit(X_train, y_train)
pred = pipe_cat.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
# The display object is kept in `s`, as in the other cells' plots it is only needed for drawing.
s = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_cat = accuracy_score(y_true=y_test, y_pred=pred)

# XG Boost

In [None]:
# Train XGBoost and evaluate it on the hold-out split.
pipe_xgb = make_pipeline(xgb).fit(X_train, y_train)
pred = pipe_xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, pred)).plot()
score_xgb = accuracy_score(y_true=y_test, y_pred=pred)

# Voting Classifier

In [None]:
# Hard-voting ensemble of logistic regression, SVC, multinomial NB and CatBoost:
# each estimator casts one vote and the majority label is predicted.
pipe_vt = VotingClassifier(estimators=[('lr',lr),('svc',svc),('mnb',mnb),('cat',cat)],voting='hard')
pipe_vt.fit(X_train,y_train)
pred_vt = pipe_vt.predict(X_test)
print(classification_report(y_test,pred_vt))
ConfusionMatrixDisplay(confusion_matrix(y_test,pred_vt)).plot()
# Score the ensemble's own predictions (pred_vt). Using `pred` here would
# report the accuracy of whichever model was evaluated last instead.
score_vt = accuracy_score(y_test,pred_vt)

In [None]:
# Gather every model's hold-out accuracy in one table, as percentages, best first.
data = {
    'modles': [
        'Logistic Regression',
        'SVC',
        'Random Forest Classifier',
        'Multinomial NB',
        'Gradient Boosting Classifier',
        'Cat Boost Classifier',
        'XGB Classifier',
        'Voting Classifier',
    ],
    'Score': [
        score_lr,
        score_svc,
        score_rf,
        score_mnb,
        score_grd,
        score_cat,
        score_xgb,
        score_vt,
    ],
}
df = pd.DataFrame(data)
df['Score'] = df['Score'].mul(100)
df.sort_values(by='Score', ascending=False)

# Submission

In [None]:
# Load the sample submission file to reuse its layout for the final predictions.
submision = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Predict labels for the test-set TF-IDF rows with the fitted multinomial NB pipeline.
predict = pipe_mnb.predict(df_test)

In [None]:
# Write the predictions into the `target` column. Item assignment is used
# because attribute-style assignment only updates a column that already exists;
# otherwise pandas sets a plain attribute and the column is never created.
submision['target'] = predict

In [None]:
# Preview the first rows of the submission frame.
submision.head()

In [None]:
# Write the submission to submission.csv without the index column.
submision.to_csv('submission.csv',index=False)