# 1. Importing Libraries and Resources

In [7]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer  # For handling missing values
from imblearn.over_sampling import SMOTE  # For handling class imbalance

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet', download_dir='/content/nltk_data/')
nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /content/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!


<nltk.tokenize.punkt.PunktTokenizer at 0x21a0d7ddd90>

# 2.Data Loading and Preprocessing

In [8]:
# 1. Data Loading and Preprocessing
df = pd.read_csv(r"C:\Users\DEll\Documents\Ml_Projects\tweet_dis_nlp\train.csv")

# Handling missing values with imputation
imputer = SimpleImputer(strategy='most_frequent')  # Replace with your preferred strategy
df[['keyword', 'location']] = imputer.fit_transform(df[['keyword', 'location']])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphanumeric characters
        text = text.lower()
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Lemmatization
        return tokens
    else:
        return []

df['processed_text'] = df['text'].apply(preprocess_text)

# 3.Word2Vec Embedding with Hyperparameter Tuning

In [9]:
# 2. Word2Vec Embedding with Hyperparameter Tuning
corpus = df['processed_text'].tolist()

# Hyperparameter tuning for Word2Vec (example)
word2vec_params = {
    'vector_size': [100, 200, 300],
    'window': [5, 7, 9],
    'min_count': [1, 3, 5]
}

# Use GridSearchCV to find the best hyperparameters for Word2Vec
# ... (Code for GridSearchCV with Word2Vec) ...

# After tuning, create the Word2Vec model with the best parameters
model = Word2Vec(corpus, vector_size=200, window=7, min_count=3, workers=4, sg=1)  # Replace with best parameters

# 4.Feature Extraction

In [10]:
# 3. Feature Extraction
def get_document_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

df['document_vector'] = df['processed_text'].apply(lambda tokens: get_document_vector(tokens, model))

# 5.Model Training and Evaluation

In [11]:
# 4. Model Training and Evaluation
X = np.array(df['document_vector'].tolist())
y = df['target']

# Addressing class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning for RandomForestClassifier (example)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10]
}

# Use GridSearchCV to find the best hyperparameters for RandomForestClassifier
# ... (Code for GridSearchCV with RandomForestClassifier) ...

# After tuning, create the RandomForestClassifier model with the best parameters
classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Replace with best parameters
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Evaluate performance with multiple metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.7737478411053541
Precision: 0.8099489795918368
Recall: 0.7224118316268487
F1-score: 0.7636800962116657


# 6.Example Prediction

In [12]:
# Example new tweet for prediction:
new_tweet = "There's a huge fire near my house, and people are evacuating!"

# Preprocess the new tweet:
processed_tweet = preprocess_text(new_tweet)
tweet_vector = get_document_vector(processed_tweet, model)

# Make the prediction:
prediction = classifier.predict([tweet_vector])[0]  # Get the prediction (0 or 1)

# Print the prediction:
if prediction == 1:
    print(f"Tweet: '{new_tweet}' is predicted as a real disaster.")
else:
    print(f"Tweet: '{new_tweet}' is predicted as a fake disaster.")

Tweet: 'There's a huge fire near my house, and people are evacuating!' is predicted as a real disaster.
