# **Natural Language Processing With Disaster Tweets**

# **1. Data Overview**

### **1.1. Meta Data**
- **id**: A unique identifier for each tweet.
- **text**: The text content of the tweet.
- **location**: The location from which the tweet was sent (this field may be blank).
- **keyword**: A specific keyword found in the tweet (this field may be blank).
- **target**: This attribute is present only in `train.csv`. It indicates whether a tweet is about a real disaster (1) or not (0).

### **1.2. What am I predicting?**
You are predicting whether a given tweet is about a real disaster or not. If so, predict a **1**. If not, predict a **0**.

### **1.3. Importing Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import emoji
import nltk
import spacy
import scipy.sparse
from tqdm import tqdm
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import FunctionTransformer
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder, StandardScaler

### **1.4. Reading data and header view**

In [None]:
# Load the competition training set from the Kaggle input directory and
# preview its first rows.
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df.head()

### **1.5. Shape of Data**

In [None]:
df.shape

### **1.6. Description of Data**

In [None]:
df.describe(include='all').round(2).style.format(precision=2).background_gradient(cmap="Reds")

### **1.7. Info about data**

In [None]:
df.info()

### **1.8. Null values in Data**

In [None]:
df.isnull().sum()

### **1.9. Value Counts of Location**

In [None]:
df['location'].value_counts()

### **1.10. Value Counts of Keyword**

In [None]:
df['keyword'].value_counts()

### **1.11. Checking Duplicates**

In [None]:
df.duplicated().sum()

### **1.12. Checking Data Types**

In [None]:
df.dtypes

# **2. Exploratory Data Analysis**

### **2.1. Histogram of Numerical Columns**

In [None]:
# Draw one histogram per numeric column, side by side in a single row.
numeric_columns = df.select_dtypes(include='number').columns

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# indexing below also works when there is only one numeric column (otherwise
# a bare Axes object, which cannot be indexed, would be returned).
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(numeric_columns),
    figsize=(20, 5),
    tight_layout=True,
    squeeze=False,
)

for i, col in enumerate(numeric_columns):
    ax = axes[0, i]
    df[col].hist(ax=ax, color='#5DADE2', edgecolor='black', alpha=0.7)
    ax.set_title(col, fontsize=14, fontweight='bold', color='#2E4053')
    # Axis labels are blanked; the column name is already shown as the title.
    ax.set_xlabel('')
    ax.set_ylabel('')

fig.suptitle('Histograms of Numerical Columns', fontsize=20, fontweight='bold', color='#1A5276', y=1.05)
plt.subplots_adjust(top=0.85)
plt.show()

### **2.2. Boxplot of Numerical Columns**

In [None]:
# Horizontal box plots of the DataFrame's columns, with custom styling.
sns.set_style('whitegrid')
sns.set_context('talk')

plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=df, orient='h', palette='Set2')

plt.title('Box Plots of Numerical Columns', fontsize=20, fontweight='bold', color='#1A5276', pad=20)
plt.xlabel('Values', fontsize=15, fontweight='bold', color='#2E4053')
plt.ylabel('Columns', fontsize=15, fontweight='bold', color='#2E4053')

ax.tick_params(axis='x', colors='#2E4053', labelsize=12)
ax.tick_params(axis='y', colors='#2E4053', labelsize=12)

# Outline the boxes. Depending on the installed matplotlib/seaborn versions
# the box artists are exposed through ax.artists (older releases) or
# ax.patches (newer releases), so both collections are visited; iterating
# only ax.artists silently styles nothing on newer releases.
for patch in [*ax.artists, *ax.patches]:
    patch.set_edgecolor('#1A5276')
    patch.set_linewidth(2)

plt.show()

### **2.3. Heatmap for MCAR (Missing Completely At Random)**

In [None]:
# Visualise missing values: df.isnull() is a boolean frame that is True
# wherever a value is missing, and the heatmap colours those cells.
plt.figure(figsize=(10,6))
sns.heatmap(df.isnull())
plt.show()

# **3. Feature Engineering**

### **3.1. Imputing Null Values**

In [None]:
# Missing keywords take the most frequent keyword; missing locations get the
# explicit placeholder 'Missing'.
most_common_keyword = df['keyword'].mode()[0]
df['keyword'] = df['keyword'].fillna(most_common_keyword)
df['location'] = df['location'].fillna('Missing')

In [None]:
df.isnull().sum()

### **3.2. Dropping the irrelevant "id" feature**

In [None]:
df.drop(columns=['id'], inplace=True)

In [None]:
df.head(5)

### **3.3. Encoding "Keyword" and "Location" columns**

In [None]:
le = LabelEncoder()

In [None]:
# Encode each categorical column with its own LabelEncoder. Refitting one
# shared encoder on the second column overwrites the classes learned for the
# first, which makes it impossible to map keyword codes back to keywords.
keyword_encoder = LabelEncoder()
location_encoder = LabelEncoder()
df['keyword'] = keyword_encoder.fit_transform(df['keyword'])
df['location'] = location_encoder.fit_transform(df['location'])

In [None]:
df.head(5)

# **4. NLP on text column**

### **4.1. Lowercasing**

In [None]:
df['text'] = df['text'].str.lower()

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.2. Removing html tags**

In [None]:
def remove_html_tags(text):
    """Return *text* with every ``<...>`` span deleted.

    The match is non-greedy, so each ``<`` is paired with the nearest
    following ``>`` on the same line; a ``<`` without a later ``>`` is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [None]:
df['text'] = df['text'].apply(remove_html_tags)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.3. Removing URLs**

In [None]:
def remove_urls(text):
    """Return *text* with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

In [None]:
df['text'] = df['text'].apply(remove_urls)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.4. Removing Punctuation**

In [None]:
def remove_punctuation(text):
    """Return *text* without any character listed in ``string.punctuation``."""
    unwanted = set(string.punctuation)
    return ''.join(char for char in text if char not in unwanted)

In [None]:
df['text'] = df['text'].apply(remove_punctuation)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.5. Chat Word Treatment**

In [None]:
# Lookup table from chat abbreviations (lower-case keys) to their expansions.
# NOTE(review): some expansions contain upper-case letters or punctuation
# ("I don't know", "you're welcome"), which re-introduces characters that the
# earlier lower-casing / punctuation-removal steps stripped - confirm that
# this is intended.
chat_words_mapping = {
    "lol": "laughing out loud",
    "brb": "be right back",
    "btw": "by the way",
    "afk": "away from keyboard",
    "rofl": "rolling on the floor laughing",
    "ttyl": "talk to you later",
    "np": "no problem",
    "thx": "thanks",
    "omg": "oh my god",
    "idk": "I don't know",
    "gg": "good game",
    "g2g": "got to go",
    "b4": "before",
    "cu": "see you",
    "yw": "you're welcome",
    "wtf": "what the f*ck",
    "imho": "in my humble opinion",
    "jk": "just kidding",
    "gf": "girlfriend",
    "bf": "boyfriend",
    "u": "you",
    "r": "are",
    "2": "to",
    "4": "for",
    "b": "be",
    "c": "see",
    "y": "why",
    "tho": "though",
    "smh": "shaking my head",
    "lolz": "laughing out loud",
    "h8": "hate",
    "luv": "love",
    "pls": "please",
    "sry": "sorry",
    "tbh": "to be honest",
    "omw": "on my way",
    "omw2syg": "on my way to see your girlfriend",
}

def expand_chat_words(text):
    """Replace chat abbreviations in *text* with their full expansions.

    The text is split on whitespace, each word is looked up in
    ``chat_words_mapping`` case-insensitively (the word is lower-cased for
    the lookup only), and words without an entry are kept unchanged. The
    result is re-joined with single spaces, so runs of whitespace collapse.
    """
    return ' '.join(chat_words_mapping.get(word.lower(), word) for word in text.split())

In [None]:
df['text'] = df['text'].apply(expand_chat_words)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.6. Removing Stop Words**

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(text):
    """Return *text* with English stop words removed.

    The text is tokenized with ``nltk.word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the rest are joined with single
    spaces. The comparison is exact, so only tokens spelled exactly like a
    list entry are removed.
    """
    # The stop-word set is cached: reloading the corpus and rebuilding the
    # set for every tweet was pure repeated work.
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
df['text'] = df['text'].apply(remove_stop_words)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.7. Replacing emojis with meanings**

In [None]:
def replace_emojis_with_meanings(text):
    """Return *text* with each emoji replaced by its ``:name:`` description.

    ``emoji.demojize`` scans the whole string itself, so no regular-expression
    pre-filter is needed. The earlier character-class pre-filter effectively
    matched every code point from U+24C2 upwards (plus a few singletons) and
    listed one range twice, so it did not really select emojis. It only
    sliced the text into pieces, which could split emoji sequences whose base
    character lies outside the class (for example keycap sequences that start
    with an ASCII digit) before they reached ``demojize``.
    """
    return emoji.demojize(text)

In [None]:
df['text'] = df['text'].apply(replace_emojis_with_meanings)

In [None]:
pd.DataFrame(df['text'].head(5))

### **4.8. Word Tokenization**

In [None]:
def word_tokenization(text):
    """Tokenize *text* with ``nltk.word_tokenize`` and return the tokens."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
df['token_text'] = df['text'].apply(word_tokenization)

In [None]:
# Preview the column produced by the tokenization step (token_text) rather
# than the unchanged input column.
pd.DataFrame(df['token_text'].head(5))

### **4.9. POS Tagging**

In [None]:
# Load the small English spaCy pipeline, with the components named in
# `disable` switched off.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])

def batch_pos_tagging(texts, batch_size=50):
    """Return, for each text, a list of ``(token_text, pos_tag)`` tuples.

    Parameters
    ----------
    texts : iterable of str
        Documents to run through the spaCy pipeline.
    batch_size : int, default 50
        Batch size forwarded to ``nlp.pipe``.
    """
    return [
        [(token.text, token.pos_) for token in doc]
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]

batch_size = 50
# Ceiling division so a final, partially filled batch is still processed.
num_batches = (len(df) + batch_size - 1) // batch_size

pos_tags = []
for i in tqdm(range(num_batches)):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(df))
    # .iloc makes the positional slicing explicit, independent of the index.
    batch_texts = df['text'].iloc[start:end].tolist()
    pos_tags.extend(batch_pos_tagging(batch_texts, batch_size=batch_size))

df['POS_Tags'] = pos_tags

In [None]:
df.head()

In [None]:
# Collapse the token lists back into space-separated strings, and keep only
# the POS tag (second element) of every (token, tag) pair.
df['token_text'] = df['token_text'].map(' '.join)
df['POS_Tags'] = df['POS_Tags'].map(lambda pairs: ' '.join(pair[1] for pair in pairs))

In [None]:
df.head()

# **5. Modeling**

### **5.1. Transformation**

In [None]:
class TextLengthExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer producing the character length of each text."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame holding ``len`` of every ``X['text']`` entry."""
        lengths = X['text'].map(len)
        return lengths.to_frame()

class NumHashtagsExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer counting whitespace-separated words starting with '#'.

    NOTE(review): the training text has its punctuation stripped in the
    "Removing Punctuations" step, and '#' is part of ``string.punctuation``,
    so this count is presumably always 0 on that data - confirm.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the hashtag count of every ``X['text']`` entry."""
        def count_hashtags(text):
            return sum(1 for word in text.split() if word.startswith('#'))

        return X['text'].map(count_hashtags).to_frame()

class NumMentionsExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer counting whitespace-separated words starting with '@'.

    NOTE(review): the training text has its punctuation stripped in the
    "Removing Punctuations" step, and '@' is part of ``string.punctuation``,
    so this count is presumably always 0 on that data - confirm.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the mention count of every ``X['text']`` entry."""
        def count_mentions(text):
            return sum(1 for word in text.split() if word.startswith('@'))

        return X['text'].map(count_mentions).to_frame()

### **5.2. Train Test Split**

In [None]:
# Features: a one-column DataFrame with the cleaned text; label: 'target'.
X, y = df[['text']], df['target']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### **5.3. Creating Pipeline for model**

In [None]:
def _select_text_column(frame):
    """Return the 'text' column of *frame* so TfidfVectorizer gets a 1-D input."""
    return frame['text']


# Feature union of a TF-IDF bag of words and three hand-crafted counts,
# followed by a gradient-boosting classifier.
# Lambdas were replaced because a FunctionTransformer wrapping a lambda
# cannot be pickled, which prevents saving the fitted model. A named function
# is used for the column selection, and FunctionTransformer's default
# (func=None, the identity function) for the pass-through selectors.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('selector', FunctionTransformer(_select_text_column, validate=False)),
            ('tfidf', TfidfVectorizer(max_features=10000))
        ])),
        ('text_length', Pipeline([
            ('selector', FunctionTransformer(validate=False)),
            ('extract', TextLengthExtractor())
        ])),
        ('num_hashtags', Pipeline([
            ('selector', FunctionTransformer(validate=False)),
            ('extract', NumHashtagsExtractor())
        ])),
        ('num_mentions', Pipeline([
            ('selector', FunctionTransformer(validate=False)),
            ('extract', NumMentionsExtractor())
        ]))
    ])),
    ('clf', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))
])

In [None]:
# Hyper-parameter grid for the search. Keys use scikit-learn's
# "<step>__<param>" path syntax to address nested pipeline steps:
# the TF-IDF vocabulary cap inside the 'text' branch, and the classifier's
# number of estimators and learning rate (2 x 2 x 2 = 8 combinations).
param_grid = {
    'features__text__tfidf__max_features': [5000, 10000],
    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.1, 0.01]
}

### **5.4. Applying Model**

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# F1 and run in a single process.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

### **5.5. Getting Predictions on Test Data**

In [None]:
# Take the best pipeline found by the search and report its per-class
# metrics on the held-out split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
report = classification_report(y_test, y_pred_best)
print(report)

# **6. Submission**

In [None]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_df.head()

In [None]:
# Apply the same text cleaning to the submission data that was applied to the
# training data (sections 4.1-4.7). Feeding raw tweets to a model fitted on
# cleaned text makes the TF-IDF vocabulary and the hand-crafted counts see a
# different distribution at prediction time.
test_X = test_df[['text']].copy()
test_X['text'] = test_X['text'].str.lower()
for clean_step in (
    remove_html_tags,
    remove_urls,
    remove_punctuation,
    expand_chat_words,
    remove_stop_words,
    replace_emojis_with_meanings,
):
    test_X['text'] = test_X['text'].apply(clean_step)

In [None]:
y_pred_submission = best_model.predict(test_X)

In [None]:
# Pair every test id with its predicted label and write the submission file
# without the index column.
submission_df = test_df[['id']].assign(target=y_pred_submission)
submission_df.to_csv('submission.csv', index=False)

## **Thank You So Much**