In [373]:
import pandas as pd
import numpy as np

In [374]:
# Display configuration: two-decimal floats (leading space reserved for the sign),
# no column limit, and a wide console so frames are not wrapped.
pd.set_option("display.float_format", '{: .2f}'.format)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)
# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

In [375]:
# Load the training set from train.csv (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [376]:
# (rows, columns) of the loaded frame.
df.shape

(7613, 5)

In [377]:
# Number of missing values in each column.
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [378]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [379]:
# Look up the row whose location is stored with garbled (mis-decoded) accented characters.
df.loc[df['location'] == 'MontrÌ©al, QuÌ©bec']

Unnamed: 0,id,keyword,location,text,target
2594,3723,destroyed,"MontrÌ©al, QuÌ©bec",The grenade sound effect on 'Impossible' just ...,0


In [380]:
# Preview the first rows again; nothing has been transformed since loading.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [381]:
# Record which rows were missing keyword/location as 0/1 indicator columns
# before the gaps are filled, so the information is not lost.
for column in ('keyword', 'location'):
    df[f'{column}_nan'] = df[column].isna().astype(int)

# Replace the missing entries with the literal placeholder string 'null'.
for column in ('location', 'keyword'):
    df[column] = df[column].fillna('null')

df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_nan,location_nan
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,1
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,1
2,5,,,All residents asked to 'shelter in place' are ...,1,1,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,1


In [382]:
# Lowercase the three free-text columns in one pass.
text_columns = ['keyword', 'location', 'text']
df[text_columns] = df[text_columns].apply(lambda series: series.str.lower())


In [383]:
# Preview after lowercasing keyword, location and text.
df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_nan,location_nan
0,1,,,our deeds are the reason of this #earthquake m...,1,1,1
1,4,,,forest fire near la ronge sask. canada,1,1,1
2,5,,,all residents asked to 'shelter in place' are ...,1,1,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,1
4,7,,,just got sent this photo from ruby #alaska as ...,1,1,1


In [384]:
import re

def count_mojibake(text):
    """Count the non-ASCII characters in ``text``.

    Every character outside the range U+0000..U+007F is counted, so legitimate
    accented letters and emoji count as well as garbled bytes.

    Returns 0 for missing values (NaN/None).
    """
    if pd.isna(text):
        return 0
    # One regex match per non-ASCII character; tally them without building a list.
    return sum(1 for _ in re.finditer(r'[^\x00-\x7F]', text))

# Per-tweet result of count_mojibake, stored as an integer column.
df['mojibake_freq'] = df['text'].apply(count_mojibake).astype(int)


In [385]:
def has_link(text):
    """Return True when ``text`` contains something that looks like a URL.

    A URL is recognised by an ``http://`` / ``https://`` scheme or a ``www.``
    prefix, matched case-insensitively. Missing values (NaN/None) give False.
    """
    if pd.isna(text):
        return False
    match = re.search(r'(http[s]?://|www\.)', text, flags=re.IGNORECASE)
    return match is not None

# has_link yields a bool per tweet; store it directly as a 0/1 integer flag.
df['has_link'] = df['text'].apply(has_link).astype(int)

In [386]:
# Regex for each count feature; the dict order fixes the order of the new columns.
marker_patterns = {
    'text_hashtags': r'#\w+',
    'text_mentions': r'@\w+',
    'text_exclamations': r'!',
    'text_questions': r'\?',
}
for feature_name, pattern in marker_patterns.items():
    df[feature_name] = df['text'].str.count(pattern)

In [387]:
# Preview with the link, hashtag, mention and punctuation count columns added.
df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_nan,location_nan,mojibake_freq,has_link,text_hashtags,text_mentions,text_exclamations,text_questions
0,1,,,our deeds are the reason of this #earthquake m...,1,1,1,0,0,1,0,0,0
1,4,,,forest fire near la ronge sask. canada,1,1,1,0,0,0,0,0,0
2,5,,,all residents asked to 'shelter in place' are ...,1,1,1,0,0,0,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,1,0,0,1,0,0,0
4,7,,,just got sent this photo from ruby #alaska as ...,1,1,1,0,0,2,0,0,0


In [388]:
import unicodedata
def is_suspicious_char(c):
    """Return True when a single character looks out of place in a location name.

    Rules:
      * ASCII characters are suspicious only when they are digits.
      * Non-ASCII characters are suspicious unless their Unicode general
        category is a letter (L*), separator (Z*) or punctuation (P*).

    Parameters
    ----------
    c : str
        A single character. ``ord`` raises ``TypeError`` for anything that is
        not exactly one character long.

    Returns
    -------
    bool
    """
    if ord(c) < 128:
        return c.isdigit()
    try:
        major_category = unicodedata.category(c)[0]
    except (TypeError, ValueError):
        # Inputs unicodedata cannot classify (e.g. a single non-ASCII byte) are
        # treated as suspicious. Only the errors category() can raise are caught,
        # so KeyboardInterrupt/SystemExit are no longer swallowed.
        return True
    return major_category not in {'L', 'Z', 'P'}

def has_suspicious_location(text):
    """Flag a location string (1/0) that contains unexpected characters.

    Returns 1 when any character is rejected by ``is_suspicious_char``, or when
    the string contains a character outside the allow-list of ASCII letters,
    whitespace, apostrophes, hyphen and a fixed set of accented letters.
    Returns 0 otherwise, and 0 for missing values.
    """
    if pd.isna(text):
        return 0
    if any(is_suspicious_char(ch) for ch in text):
        return 1
    disallowed = re.search(r"[^a-zA-Z\s'’\-éèêàçñöüïÉÈÊÀÇÑÖÜÏ]", text)
    return 1 if disallowed else 0

# 0/1 flag per row from has_suspicious_location, computed on the lowercased, 'null'-filled location.
df['location_spec'] = df['location'].apply(has_suspicious_location)


In [389]:
# Frequency table for each categorical/flag column, separated by a line of asterisks.
categorical_columns = ['keyword', 'location', 'target', 'keyword_nan', 'location_nan', 'has_link', 'location_spec']
for column in categorical_columns:
    print(df[column].value_counts())
    print("*" * 50)

keyword
null                     61
fatalities               45
deluge                   42
armageddon               42
sinking                  41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 222, dtype: int64
**************************************************
location
null                   2533
usa                     104
new york                 75
united states            50
london                   49
                       ... 
montreal                  1
ìït: 6.4682,3.18287       1
live4heed??               1
waco, texas               1
lincoln                   1
Name: count, Length: 3234, dtype: int64
**************************************************
target
0    4342
1    3271
Name: count, dtype: int64
**************************************************
keyword_nan
0    7552
1      61
Name: count, dtype: int64
************************

In [390]:
from nltk.tokenize import sent_tokenize,word_tokenize

# Length features of the (lowercased) tweet text: characters, NLTK word tokens
# and NLTK sentences.
tweet_text = df['text']
df['text_char'] = tweet_text.apply(len)
df['text_words'] = tweet_text.apply(word_tokenize).apply(len)
df['text_sent'] = tweet_text.apply(sent_tokenize).apply(len)

In [391]:
# Summary statistics for each numeric text feature, separated by a line of asterisks.
numeric_columns = ['text_char', 'text_words', 'text_sent','mojibake_freq','text_hashtags','text_mentions','text_questions','text_exclamations']
for column in numeric_columns:
    print(column)
    print(df[column].describe())
    print("*" * 50)

text_char
count    7613.00
mean      101.04
std        33.78
min         7.00
25%        78.00
50%       107.00
75%       133.00
max       157.00
Name: text_char, dtype: float64
**************************************************
text_words
count    7613.00
mean       18.91
std         6.87
min         1.00
25%        14.00
50%        19.00
75%        24.00
max        72.00
Name: text_words, dtype: float64
**************************************************
text_sent
count    7613.00
mean        1.55
std         0.95
min         1.00
25%         1.00
50%         1.00
75%         2.00
max        22.00
Name: text_sent, dtype: float64
**************************************************
mojibake_freq
count    7613.00
mean        0.30
std         1.15
min         0.00
25%         0.00
50%         0.00
75%         0.00
max        18.00
Name: mojibake_freq, dtype: float64
**************************************************
text_hashtags
count    7613.00
mean        0.44
std         1.09
min      

In [392]:
from collections import Counter

def repetition_score(text):
    """Fraction of whitespace-separated words that belong to a repeated word.

    The text is lowercased and split on whitespace. Every occurrence of a word
    that appears more than once counts towards the numerator; the denominator
    is the total word count. Returns 0 for text with no words.
    """
    tokens = text.lower().split()
    if not tokens:
        return 0
    repeated_total = 0
    for occurrences in Counter(tokens).values():
        if occurrences > 1:
            repeated_total += occurrences
    return repeated_total / len(tokens)

# Per-tweet repetition score; missing text is treated as an empty string.
df['repetition_score'] = df['text'].fillna('').apply(repetition_score)

def max_word_freq(text):
    """Share of the text taken up by its single most frequent word.

    The text is lowercased and split on whitespace; the result is the highest
    word count divided by the total word count. Returns 0 for text with no words.
    """
    tokens = text.lower().split()
    if len(tokens) == 0:
        return 0
    # most_common(1) yields [(word, count)] for the most frequent word.
    top_count = Counter(tokens).most_common(1)[0][1]
    return top_count / len(tokens)

# Per-tweet share of the most frequent word; missing text is treated as an empty string.
df['max_word_repeat_ratio'] = df['text'].fillna('').apply(max_word_freq)


In [393]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, minus the negations, which are kept in the cleaned text.
stop_words = {
    word
    for word in stopwords.words('english')
    if word not in {'no', 'not', 'nor', 'never'}
}
lemmatizer = WordNetLemmatizer()

def transformtext(text):
    """Clean one tweet into a space-joined string of normalised tokens.

    Steps:
      * missing values become the empty string;
      * the text is split with ``word_tokenize``;
      * tokens whose lowercase form is in ``stop_words`` are dropped;
      * purely alphanumeric ASCII tokens are lowercased and lemmatized;
      * tokens of the form ``#word`` or ``@word`` are kept lowercased;
      * the single tokens ``!`` and ``?`` are kept as they are;
      * every other token is discarded.

    NOTE(review): the sample output after this step shows "earthquake" without
    its leading '#', which suggests the tokenizer separates the marker from the
    tag, so the hashtag/mention branches may rarely fire — confirm.
    """
    if pd.isna(text):
        return ""

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue

        if re.match(r'^[a-zA-Z0-9]+$', token):
            kept.append(lemmatizer.lemmatize(lowered))
        elif re.match(r'^#\w+$', token) or re.match(r'^@\w+$', token):
            kept.append(lowered)
        elif token in {'!', '?'}:
            kept.append(token)

    return " ".join(kept)


In [394]:
# Replace the tweet text with its cleaned form.
# NOTE: this overwrites df['text'] in place, so re-running the cell cleans already-cleaned text.
df['text']=df['text'].apply(transformtext)

In [395]:
# Preview with the cleaned text and all engineered feature columns.
df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_nan,location_nan,mojibake_freq,has_link,text_hashtags,text_mentions,text_exclamations,text_questions,location_spec,text_char,text_words,text_sent,repetition_score,max_word_repeat_ratio
0,1,,,deed reason earthquake may allah forgive u,1,1,1,0,0,1,0,0,0,0,69,14,1,0.0,0.08
1,4,,,forest fire near la ronge sask canada,1,1,1,0,0,0,0,0,0,0,38,8,2,0.0,0.14
2,5,,,resident asked place notified officer no evacu...,1,1,1,0,0,0,0,0,0,0,133,24,2,0.18,0.09
3,6,,,people receive wildfire evacuation order calif...,1,1,1,0,0,1,0,0,0,0,65,9,1,0.0,0.12
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1,1,1,0,0,2,0,0,0,0,88,18,1,0.12,0.12


In [396]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# features = [
#     'keyword_nan', 'location_nan', 'mojibake_freq',
#     'has_link', 'text_hashtags', 'text_mentions', 'text_exclamations',
#     'text_questions', 'location_spec', 'text_char', 'text_words', 'text_sent'
# ]

# cols = 4
# rows = (len(features) + cols - 1) // cols

# plt.figure(figsize=(20, 5 * rows))

# for i, feature in enumerate(features, 1):
#     plt.subplot(rows, cols, i)
#     if df[feature].dtype == 'object':
#         sns.countplot(data=df, x=feature, hue='target')
#         plt.xticks(rotation=45)
#     else:
#         sns.boxplot(data=df, x='target', y=feature)

#     plt.title(f'{feature} vs target')
#     plt.tight_layout()

# plt.show()


In [397]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation, stratified on 'target', with a fixed seed.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42, stratify=df['target'])

In [398]:
# Separate the label from the predictors for each split.
y_train, y_test = df_train['target'], df_test['target']

X_train = df_train.drop(columns='target')
X_test = df_test.drop(columns='target')


In [399]:
# Numeric columns appended to the TF-IDF matrices.
# NOTE(review): 'repetition_score' and 'max_word_repeat_ratio' are computed earlier but are not listed here — confirm this is intended.
features = ['keyword_nan', 'location_nan', 'mojibake_freq', 'has_link', 'text_hashtags', 'text_mentions', 'text_exclamations', 'text_questions', 'location_spec', 'text_char', 'text_words', 'text_sent']

In [400]:
from scipy.sparse import hstack

# One vectorizer per text column; only the tweet-text vocabulary is capped.
tfidf_keyword = TfidfVectorizer()
tfidf_location = TfidfVectorizer()
tfidf_text = TfidfVectorizer(max_features=2000)

# Each vectorizer is fitted on the training split only and then reused on the test split.
X_train_keyword_tfidf = tfidf_keyword.fit_transform(X_train['keyword'])
X_test_keyword_tfidf = tfidf_keyword.transform(X_test['keyword'])

X_train_location_tfidf = tfidf_location.fit_transform(X_train['location'])
X_test_location_tfidf = tfidf_location.transform(X_test['location'])

X_train_text_tfidf = tfidf_text.fit_transform(X_train['text'])
X_test_text_tfidf = tfidf_text.transform(X_test['text'])

# Column-wise concatenation: keyword | location | text.
X_train_tfidf = hstack([X_train_keyword_tfidf, X_train_location_tfidf, X_train_text_tfidf])
X_test_tfidf = hstack([X_test_keyword_tfidf, X_test_location_tfidf, X_test_text_tfidf])

# Dense block of the engineered numeric columns, in the order given by `features`.
X_train_extra = X_train[features].values
X_test_extra = X_test[features].values

# Final design matrices: TF-IDF columns followed by the numeric columns.
X_train_final = hstack([X_train_tfidf, X_train_extra])
X_test_final = hstack([X_test_tfidf, X_test_extra])



In [401]:
from sklearn.preprocessing import Binarizer

# Convert every feature to presence/absence: values above 0 become 1, the rest 0.
binarizer = Binarizer(threshold=0.0).fit(X_train_final)

X_train_final_binarized = binarizer.transform(X_train_final)
X_test_final_binarized = binarizer.transform(X_test_final)


In [402]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Bernoulli Naive Bayes on the binarized matrices.
bnb = BernoulliNB().fit(X_train_final_binarized, y_train)
y_pred_bnb = bnb.predict(X_test_final_binarized)

# Hold-out accuracy and per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_bnb))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_bnb))

Accuracy: 0.8177521008403361

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85      1086
           1       0.83      0.73      0.77       818

    accuracy                           0.82      1904
   macro avg       0.82      0.81      0.81      1904
weighted avg       0.82      0.82      0.82      1904



In [403]:
# Multinomial Naive Bayes on the non-binarized TF-IDF + numeric matrices.
mnb = MultinomialNB().fit(X_train_final, y_train)
y_pred_mnb = mnb.predict(X_test_final)

# Hold-out accuracy and per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_mnb))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_mnb))

Accuracy: 0.7904411764705882

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82      1086
           1       0.76      0.74      0.75       818

    accuracy                           0.79      1904
   macro avg       0.79      0.78      0.79      1904
weighted avg       0.79      0.79      0.79      1904



In [404]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults on the non-binarized matrices.
xgb = XGBClassifier().fit(X_train_final, y_train)
y_pred_xgb = xgb.predict(X_test_final)

# Hold-out accuracy and per-class precision/recall/F1.
print("XGBoost Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_xgb))


XGBoost Accuracy: 0.7878151260504201

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.86      0.82      1086
           1       0.79      0.69      0.74       818

    accuracy                           0.79      1904
   macro avg       0.79      0.78      0.78      1904
weighted avg       0.79      0.79      0.79      1904



In [405]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 unrestricted-depth trees, fixed seed, trained on all CPU cores.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=None, random_state=42, n_jobs=-1
).fit(X_train_final, y_train)
y_pred_rf = rf.predict(X_test_final)

# Hold-out accuracy and per-class precision/recall/F1.
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_rf))


Random Forest Accuracy: 0.789390756302521

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.91      0.83      1086
           1       0.84      0.62      0.72       818

    accuracy                           0.79      1904
   macro avg       0.80      0.77      0.78      1904
weighted avg       0.80      0.79      0.78      1904

