### Introduction

In my [previous notebook](https://www.kaggle.com/code/faeghehghofrani/simple-cleaning-no-eda-ridge-classifier), I obtained a suitable model with relatively high accuracy by using a simple method to clean tweet data and evaluate 6 classification models on 2 data splitting methods.

In this notebook, by adding EDA, feature engineering and more thorough data cleaning, I achieved higher accuracy than in the previous notebook.

To write this document, I have used the following notebooks:
* https://www.kaggle.com/code/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert
* https://www.kaggle.com/code/vbmokin/nlp-eda-bag-of-words-tf-idf-glove-bert
* https://www.kaggle.com/code/rohitgarud/all-almost-data-preprocessing-techniques-for-nlp

### Import libraries

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
import string
import re
from scipy.sparse import hstack, coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.linear_model import RidgeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
warnings.filterwarnings('ignore')

import warnings
warnings.filterwarnings('ignore')
import os
# Walk the input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### Import Data

In [None]:
# Load the train and test CSV files into DataFrames.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

### EDA

In [None]:
def Info_dataFrame(df):
    """Print a structural summary of ``df``.

    Three sections are printed: the ``DataFrame.info()`` report, the number
    of null cells per column, and the number of distinct values per column.

    The headings use the name of the first module-level variable bound to
    ``df``. When no global refers to it (a local variable, a temporary, a
    slice), the generic label ``'DataFrame'`` is used instead of raising
    ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe.

    Returns
    -------
    None
    """
    # Identity comparison on purpose: ``==`` on a DataFrame is element-wise
    # and has no single truth value.
    name = next(
        (key for key, value in list(globals().items()) if value is df),
        'DataFrame',
    )
    print('informaion of {}:'.format(name))
    print("--"*20)
    # info() writes its own report and returns None; wrapping it in print()
    # would emit a stray "None" line after the report.
    df.info()
    print("=="*20)
    print('informaion about count of Null in {}:'.format(name))
    print("--"*20)
    print(df.isnull().sum())
    print("=="*20)
    print('informaion about count of unique Value in {}:'.format(name))
    print("--"*20)
    print(df.nunique())

In [None]:
# Print the info / null-count / unique-count summary for the training data.
Info_dataFrame(train_df)

In [None]:
# Print the info / null-count / unique-count summary for the test data.
Info_dataFrame(test_df)

#### 1. Keyword Analysis

Based on the above information, the number of null cells and the number of unique values of Keyword in the train and test data are as follows:

----------------
##### train data:
----------------
All data = 7613

Null data = 61

Number of unique values = 221


----------------
##### test data:
----------------
All data = 3263

Null data = 26

Number of unique values = 221

#### 2. location Analysis
Based on the above information, the number of null cells and the number of unique values of location in the train and test data are as follows:

----------------
##### train data:
----------------
All data = 7613

Null data = 2533

Number of unique values = 3341


----------------
##### test data:
----------------
All data = 3263

Null data = 1105

Number of unique values = 1602

#### 3. text Analysis

When analyzing the tweets, we extract meta-features (word, stop-word, URL, punctuation, hashtag, mention and emoji counts) because these features can help the model distinguish true tweets from fake ones.

In [None]:
def text_analyze(df):
    """Add tweet meta-feature columns to ``df`` in place and return it.

    Every value of ``df['text']`` is coerced with ``str()`` first, so a
    missing or non-string cell is counted on its string form instead of
    raising.

    Columns written:

    * ``word_count``        - whitespace-separated tokens
    * ``stop_word_count``   - lower-cased tokens found in NLTK's English
      stop-word list
    * ``url_count``         - lower-cased tokens containing ``'http'``
      (which also covers ``'https'``)
    * ``punctuation_count`` - characters found in ``string.punctuation``
    * ``hashtag_count``     - ``'#'`` characters
    * ``mention_count``     - ``'@'`` characters
    * ``emoji-count``       - matches of the emoji pattern; the pattern ends
      in ``+``, so a run of adjacent emoji counts once

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    # Load the stop-word list once, as a set; looking it up again for every
    # word of every tweet made this function needlessly slow.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    # Compiled once per call rather than once per row.
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # matches many non-emoji characters; kept as-is so feature values stay
    # unchanged.
    emoji_pattern = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)

    texts = df['text'].map(str)

    df['word_count'] = texts.apply(lambda t: len(t.split()))
    df['stop_word_count'] = texts.apply(
        lambda t: sum(1 for w in t.lower().split() if w in stop_words))
    df['url_count'] = texts.apply(
        lambda t: sum(1 for w in t.lower().split() if 'http' in w))
    df['punctuation_count'] = texts.apply(
        lambda t: sum(1 for c in t if c in punctuation_chars))
    df['hashtag_count'] = texts.apply(lambda t: t.count('#'))
    df['mention_count'] = texts.apply(lambda t: t.count('@'))
    df['emoji-count'] = texts.apply(lambda t: len(emoji_pattern.findall(t)))

    return df


In [None]:
# Add the meta-feature columns to the training data (modified in place; the returned frame is displayed).
text_analyze(train_df)

In [None]:
# Add the same meta-feature columns to the test data (modified in place; the returned frame is displayed).
text_analyze(test_df)

### Cleaning text

To clean tweet data, various data processing techniques have been used. The cleaning steps are as follows:

- Removing HTML
- Expand Contractions
- Removing URLs
- Removing Email IDs
- Remove emojis
- Removing Twitter mentions
- Abbreviation/Acronym Disambiguation
- Removing Unicode Characters
- Removing Punctuations
- Handling Digits or Words with Digits
- Removing Stopwords
- lower case

In [None]:
pip install contractions

In [None]:
import contractions
import string
#from spellchecker import SpellChecker

# Emoji / pictograph code-point ranges removed by clean_tweet. Compiled once
# here instead of on every call.
_TWEET_EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

# Literal (case-sensitive) substitutions, applied in this order: abbreviation
# expansion first, then HTML entity decoding.
_TWEET_REPLACEMENTS = (
    ("MH370", "Malaysia Airlines Flight 370"),
    ("mÌ¼sica", "music"),
    ("okwx", "Oklahoma City Weather"),
    ("arwx", "Arkansas Weather"),
    ("gawx", "Georgia Weather"),
    ("scwx", "South Carolina Weather"),
    ("cawx", "California Weather"),
    ("tnwx", "Tennessee Weather"),
    ("azwx", "Arizona Weather"),
    ("alwx", "Alabama Weather"),
    ("wordpressdotcom", "wordpress"),
    ("usNWSgov", "United States National Weather Service"),
    ("Suruc", "Sanliurfa"),
    ("&gt;", ">"),
    ("&lt;", "<"),
    ("&amp;", "&"),
)

# Maps each listed punctuation character to a single space.
_TWEET_PUNCTUATION_TABLE = str.maketrans(
    dict.fromkeys('@#!?+&*[]-%.:/();$=><|{}^' + "'`", " "))


def _tweet_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    cached = getattr(_tweet_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _tweet_stopwords._cache = cached
    return cached


def clean_tweet(text):
    """Return a normalised, lower-cased version of a tweet for vectorising.

    Steps, in order: strip HTML tags, expand contractions word by word,
    remove URLs, e-mail addresses, emoji and ``@mentions``, expand a fixed
    set of abbreviations and HTML entities, drop non-ASCII characters,
    replace the listed punctuation with spaces, remove tokens containing
    digits, lower-case, and remove English stop words.

    Stop words are filtered after lower-casing. The NLTK list is lower-case,
    so filtering first left capitalised stop words such as "The" or "I" in
    the output.

    Parameters
    ----------
    text : str
        Raw tweet text; any other value is coerced with ``str()``.

    Returns
    -------
    str
        The cleaned text, with tokens separated by single spaces.
    """
    # Remove_HTMLs
    new_text = re.sub(r'<.*?>', "", str(text))

    # Expand_Contractions (per token; this also collapses whitespace runs)
    new_text = ' '.join(contractions.fix(word) for word in new_text.split())

    # Remove_URLs
    new_text = re.sub(r'https?://(www\.)?(\w+)(\.\w+)(/\w*)?', "", new_text)

    # Remove_Email_IDs (must run before mention removal, which would
    # otherwise eat the part after '@')
    new_text = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+', "", new_text)

    # Remove_emojis
    new_text = _TWEET_EMOJI_PATTERN.sub(r'', new_text)

    # Remove_Tweeter_Mentions_Chars
    new_text = re.sub(r'@\w+', "", new_text)

    # Abbreviation/Acronym_Disambiguation and HTML entities. The patterns
    # contain no regex metacharacters, so plain replacement is equivalent.
    for old, new in _TWEET_REPLACEMENTS:
        new_text = new_text.replace(old, new)

    # Remove_Unicode_Characters
    new_text = new_text.encode("ascii", "ignore").decode()

    # Remove_Punctuations (each listed character becomes a space)
    new_text = new_text.translate(_TWEET_PUNCTUATION_TABLE)

    # Remove_Digits (any token containing at least one digit)
    new_text = re.sub(r'\w*\d+\w*', "", new_text)

    # Lower_case first, so capitalised stop words are matched as well
    new_text = new_text.lower()

    # Remove_Stopwords
    stop_words = _tweet_stopwords()
    new_text = " ".join(word for word in new_text.split() if word not in stop_words)

    return new_text

In [None]:
# Store the cleaned version of every training tweet in a new 'clean_text' column, then display the frame.
train_df['clean_text'] = train_df['text'].apply(clean_tweet)
train_df

In [None]:
# Apply the same cleaning to the test tweets, then display the frame.
test_df['clean_text'] = test_df['text'].apply(clean_tweet)
test_df

### Model

Following the results of the [previous notebook](https://www.kaggle.com/code/faeghehghofrani/simple-cleaning-no-eda-ridge-classifier), this notebook also uses TF-IDF to vectorize the tweets. We then append the meta-features to the resulting matrix, compare several classification models to find the best one, and use it to train and predict.

In [None]:
# Convert the text column to a matrix of TF-IDF features
# (the vectorizer is fitted on the cleaned training tweets).

tfidf = TfidfVectorizer()
text_matrix = tfidf.fit_transform(train_df['clean_text'])

# Concatenate the TF-IDF matrix with the other columns:
# the seven meta-feature columns are appended as extra columns on the right.

arr = np.array(train_df[['word_count', 'stop_word_count', 'url_count', 'punctuation_count', 'hashtag_count', 'mention_count', 'emoji-count']].values)
M = coo_matrix(arr)
text_matrix_full = hstack((text_matrix, M))

# Bare expression: shows the combined matrix as the cell output.
text_matrix_full

In [None]:
def split_train(df, model):
    """Fit ``model`` on a 70/30 stratified split of ``df`` and print its accuracy.

    The feature matrix is the TF-IDF encoding of ``df['clean_text']`` with the
    seven meta-feature columns appended, the same layout the final model is
    trained on. The earlier version built that combined matrix but then
    trained on the TF-IDF part only, so the meta-features were silently
    ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``clean_text``, ``target`` and the seven meta-feature
        columns listed in ``meta_columns`` below.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place.

    Returns
    -------
    float
        Accuracy on the held-out 30 %, rounded to three decimals. The same
        value is printed as a percentage.
    """
    meta_columns = ['word_count', 'stop_word_count', 'url_count',
                    'punctuation_count', 'hashtag_count', 'mention_count',
                    'emoji-count']

    # NOTE(review): the vectorizer is fitted before the split, so IDF weights
    # also see the held-out tweets; fit on the training part only for a
    # stricter evaluation.
    tfidf = TfidfVectorizer()
    text_matrix = tfidf.fit_transform(df['clean_text'])
    meta_matrix = coo_matrix(np.array(df[meta_columns].values))
    text_matrix_full = hstack((text_matrix, meta_matrix))

    # Dense array: some of the compared estimators (e.g. GaussianNB) do not
    # accept sparse input.
    X = text_matrix_full.toarray()
    y = df['target'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 3)

    print(f'Accuracy: {np.round(accuracy*100,2)}%')
    print('='*50)
    return accuracy

In [None]:
# Candidate classifiers, all with default hyper-parameters.
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier(), GaussianNB(), RidgeClassifier()]
print("Models Accuracy with TFIDF split:")
# Evaluate every candidate on the same split (split_train uses a fixed random_state).
for mod in models:
    print(f'Model :{mod}')
    split_train(train_df, mod)

In [None]:
# Fit a fresh TF-IDF vectorizer on all cleaned training tweets.
tf = TfidfVectorizer()
train_Vector = tf.fit_transform(train_df['clean_text'])
# Append the seven meta-feature columns to the TF-IDF matrix.
arr = np.array(train_df[['word_count', 'stop_word_count', 'url_count', 'punctuation_count', 'hashtag_count', 'mention_count', 'emoji-count']].values)
M = coo_matrix(arr)
train_Vector_full = hstack((train_Vector, M))

# Train the final RidgeClassifier on the full training set.
model = RidgeClassifier()
model.fit(train_Vector_full, train_df["target"])

In [None]:
# transform (not fit_transform): reuse the vectorizer fitted on the training tweets.
test_vector = tf.transform(test_df['clean_text'])
# Append the test meta-features in the same column order used for training.
arr_test = np.array(test_df[['word_count', 'stop_word_count', 'url_count', 'punctuation_count', 'hashtag_count', 'mention_count', 'emoji-count']].values)
M_test = coo_matrix(arr_test)
test_vector_full = hstack((test_vector, M_test))
# Bare expression: shows the combined test matrix as the cell output.
test_vector_full

In [None]:
# Display the predicted labels for the test set (the result is not stored here).
model.predict(test_vector_full)

In [None]:
# Load the sample submission file and preview its first rows.
sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sub.head()

In [None]:
# Build the submission frame: test ids paired with the model's predictions (predict is run again here).
submission = pd.DataFrame({"id":test_df["id"],"target":model.predict(test_vector_full)})

In [None]:
# Display the submission frame.
submission

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
submission.to_csv("submission.csv",index = False)