<a href="https://www.kaggle.com/code/sachinpatil1280/tweets-classification-deep-learning-score-0-7750?scriptVersionId=144401117" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [None]:
# Basic
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Feature Engineering
import nltk
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# Deep learning modules
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from keras.utils import plot_model

# Import train and test data

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train_sample = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_sample = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Show the first rows of the training frame as the cell output.
train_sample.head()

# Basic Analysis

In [None]:
# Show the first rows of the test frame.
test_sample.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the training frame.
train_sample.info()

In [None]:
# Print the column summary (dtypes, non-null counts) of the test frame.
test_sample.info()

In [None]:
# Labels: the 'target' column of the training frame.
y_train = train_sample['target']
y_train

In [None]:
# Features: every training column except 'target'.
X_train = train_sample.drop(columns='target')
X_train

# Missing Value

In [None]:
# Number of missing labels.
y_train.isna().sum()

In [None]:
# Number of missing values per training feature column.
X_train.isna().sum()

In [None]:
# Work on a copy so that the in-place fillna/drop steps applied to X_test in
# later cells do not also modify test_sample (and so those cells can be re-run
# without the 'id' column having already disappeared from the source frame).
X_test = test_sample.copy()
# Number of missing values per test feature column.
X_test.isna().sum()

In [None]:
# Replace every missing value in both feature frames with the string '0',
# modifying the frames in place.
X_train.fillna('0',inplace= True)
X_test.fillna('0',inplace=True)

In [None]:
# Remove the 'id' column from both feature frames, in place.
X_train.drop(columns='id',inplace =True)
X_test.drop(columns='id',inplace =True)

# EDA

In [None]:
# Count plot of how many training rows fall under each 'target' value
sns.countplot(data = train_sample,x='target')
plt.title('Distribution of target values')

In [None]:
# Group the training rows by keyword and count the 'target' entries in each
# group, then tabulate keyword/count pairs sorted from most to least frequent.
keyword = train_sample.groupby('keyword')['target'].count()
df_key = pd.DataFrame({'keywords':keyword.index,'count':keyword.values}).sort_values(by='count',ascending=False)
df_key.head()

In [None]:
# Bar plot of the 30 most frequent keywords in the training tweets.
plt.figure(figsize=(14,5))
sns.barplot(data=df_key.head(30),x='keywords',y='count')
# Rotate the tick labels so the keyword names do not overlap.
plt.xticks(rotation = 50)
plt.title('Top 30 keywords on Tweets')
plt.tight_layout()
plt.show()

In [None]:
# Group the training rows by location and count the 'target' entries in each
# group, then tabulate location/count pairs sorted from most to least frequent.
location = train_sample.groupby('location')['target'].count()
df_loc = pd.DataFrame({'location':location.index,'count':location.values}).sort_values(by='count',ascending=False)
df_loc.head()

In [None]:
# Bar plot of the 30 most frequent locations in the training tweets.
plt.figure(figsize=(14,5))
sns.barplot(data=df_loc.head(30),x='location',y='count')
# Rotate the tick labels so the location names do not overlap.
plt.xticks(rotation = 50)
plt.title('Top 30 locations of Tweets')
plt.tight_layout()
plt.show()

In [None]:
# Length of each training tweet, measured with len() on the raw 'text' string
# (i.e. a character count).
length_train = train_sample['text'].apply(len)
length_train

In [None]:
# Histogram (70 bins) of the text lengths in the training data.
plt.figure(figsize=(14,5))
sns.histplot(data = length_train,bins=70)
plt.title("Distribution of sentence lengths for Train data")
plt.tight_layout()
plt.show()

In [None]:
# Length of each test tweet, measured with len() on the 'text' string
# (i.e. a character count).
length_test = test_sample['text'].apply(len)
length_test

In [None]:
# Histogram (70 bins) of the text lengths in the test data.
plt.figure(figsize=(14,5))
sns.histplot(data = length_test,bins=70)
plt.title("Distribution of sentence lengths for Test data")
plt.tight_layout()
plt.show()

In [None]:
# Histogram (70 bins) of training text lengths, split by 'target' through the
# hue argument and overlaid with a KDE curve per class.
plt.figure(figsize=(14,5))
sns.histplot(data=train_sample,x=length_train,hue='target',palette='hsv',kde= True,bins=70)
plt.title("Distribution of sentence lengths With respect to target")
plt.tight_layout()
plt.show()

In [None]:
# HistPlot: Sentence Length vs. Target (two-variable histogram:
# 'target' on the x axis, text length on the y axis)
plt.figure(figsize=(14,5))
sns.histplot(data = train_sample,y=length_train,x='target',legend= True)
# Label each axis with the variable actually plotted on it.
plt.xlabel('Target')
plt.ylabel('Sentence Length')
plt.title('Histogram: Sentence Length vs. Target')
plt.show()

# Feature Engineering

In [None]:
# Text-cleaning helper applied to the tweet text before modelling.
stemmer = PorterStemmer()

# Build the stopword lookup once. Calling stopwords.words('english') for every
# token repeats the lookup and makes each membership test a scan over a list.
_STOPWORDS = frozenset(stopwords.words('english'))
# Patterns are compiled once. The original re.MULTILINE flag is omitted because
# the URL pattern contains no ^ or $ anchors, so the flag had no effect.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+")
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Remove URLs and punctuation from *text*, drop stopwords, and stem.

    Args:
        text: Raw tweet text as a string.

    Returns:
        The surviving tokens, each passed through the module-level
        ``stemmer``, joined with single spaces.
    """
    cleaned_text = _URL_PATTERN.sub('', text)
    cleaned_text = _PUNCTUATION_PATTERN.sub('', cleaned_text)

    tokens = nltk.word_tokenize(cleaned_text)

    # The NLTK stopword list is lowercase, so compare case-insensitively;
    # otherwise capitalised stopwords such as "The" slip through the filter.
    filtered_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token.lower() not in _STOPWORDS
    ]
    return ' '.join(filtered_tokens)

In [None]:
# Clean every training tweet with preprocess_text, overwriting the 'text' column.
X_train["text"] = X_train["text"].apply(preprocess_text)

In [None]:
# Build one text corpus per class for the word clouds: all cleaned tweets with
# target == 1 (disaster) and all with target == 0 (non-disaster).
# Both corpora are joined with a space so the last word of one tweet is not
# fused with the first word of the next.
disaster = ' '.join(X_train[train_sample['target']==1]['text'])
non_disaster = ' '.join(X_train[train_sample['target']==0]['text'])

In [None]:
# Disaster Wordcloud
plt.figure(figsize=(14,7))
# The palette is set on the WordCloud itself: the rendered cloud is an RGB
# image, and imshow ignores its cmap argument for RGB data.
wordcloud = WordCloud(width=1000,height=500,colormap='Dark2').generate(disaster)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
# Set the title before tight_layout so the layout pass accounts for it.
plt.title('Disaster Wordcloud',fontsize= 25,color='Red')
plt.tight_layout()
plt.show()

In [None]:
# Non-Disaster Wordcloud
plt.figure(figsize=(14,7),frameon=True)
# The palette is set on the WordCloud itself: the rendered cloud is an RGB
# image, and imshow ignores its cmap argument for RGB data.
wordcloud = WordCloud(width=800,height=400,colormap='Dark2').generate(non_disaster)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
# Set the title before tight_layout so the layout pass accounts for it.
plt.title('Non-Disaster Wordcloud',fontsize= 25,color='Black')
plt.tight_layout()
plt.show()

# Model 

In [None]:
# Load the BERT text-preprocessing layer and a BERT encoder from TensorFlow Hub
# by URL. No `trainable` argument is passed, so the KerasLayer default applies.
preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer('https://tfhub.dev/google/experts/bert/wiki_books/sst2/2')

In [None]:
# Build the classifier graph with the Keras functional API.
# Input: a scalar string per example (the tweet text).
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text-layer')
# Raw strings -> encoder inputs -> encoder outputs (a dict of tensors).
preprocessed_text = preprocess(text_input)
outputs = encoder(preprocessed_text)
# Classification head on the encoder's 'pooled_output':
# dropout -> Dense(256, relu) -> dropout -> Dense(32, relu) -> Dense(1, sigmoid).
d_layer = tf.keras.layers.Dropout(0.1, name="dropout-layer1")(outputs['pooled_output'])
dense_layer1 = tf.keras.layers.Dense(256, activation='relu', name="dense-layer1")(d_layer)
dropout_layer2 = tf.keras.layers.Dropout(0.1, name="dropout-layer2")(dense_layer1)
dense_layer2 = tf.keras.layers.Dense(32, activation='relu', name="dense-layer2")(dropout_layer2)
# Single sigmoid unit: one value in [0, 1] per tweet.
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dense_layer2)
model = tf.keras.Model(inputs=[text_input], outputs = [output_layer])

In [None]:
# Draw the model's layer graph, including tensor shapes, and save it to model.png.
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
# Metrics tracked during training: binary accuracy, precision and recall.
m= [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
# Compile with the Adam optimizer and binary cross-entropy loss, matching the
# single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=m)

In [None]:
# Fit the model on the cleaned training text for 10 epochs.
# NOTE(review): no validation data or validation_split is passed, so the
# metrics reported during fitting are computed on the training data only.
model.fit(X_train['text'], y_train, epochs=10)

In [None]:
# Class predictions for the training tweets: flatten the model outputs to 1-D,
# then label values above 0.5 as 1 and everything else as 0.
y_pred = np.where(model.predict(X_train['text']).flatten() > 0.5, 1, 0)

# Submission

In [None]:
# Load the competition's sample submission file to use as the output template.
sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Clean every test tweet with the same preprocess_text function used for the
# training data, overwriting the 'text' column.
X_test['text'] = X_test["text"].apply(preprocess_text)

In [None]:
# Class predictions for the test tweets: flatten the model outputs to 1-D,
# then label values above 0.5 as 1 and everything else as 0.
y_predicted = np.where(model.predict(X_test['text']).flatten() > 0.5, 1, 0)

In [None]:
# Put the test-set class predictions into the template's 'target' column.
sample["target"] = y_predicted
sample.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("submission.csv", index=False)