In [None]:
pip install pandas numpy nltk




In [None]:
pip install pandas numpy nltk scikit-learn gensim tqdm



Load the Dataset

In [None]:
import pandas as pd

# Read the raw tweet dataset from disk
file_path = 'final.csv'  # Replace with your file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top of the frame (five rows, which is head()'s default)
print(df.head(n=5))


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


Pre-processing and Cleaning of Dataset


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet data
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Re-read the raw CSV so this cell starts from an unmodified frame; `df` is
# mutated further down (a 'clean_text' column is added and 'text' is dropped).
file_path = 'final.csv'  # Replace with your file name
df = pd.read_csv(file_path)

# Preprocessing function
# Patterns are compiled once instead of on every call.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)
_MENTION_OR_HASH_PATTERN = re.compile(r'@\w+|#')
_NON_ALNUM_PATTERN = re.compile(r'[^A-Za-z0-9]+')

# Filled on first use so the NLTK corpora are only touched when actually needed.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return (stopword set, lemmatizer), building both on the first call."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalize a raw tweet into a lowercase, lemmatized, stopword-free string.

    Steps, in order: lowercase, strip URLs, strip whole @mentions and the '#'
    symbol (the hashtag word itself is kept), replace every run of
    non-alphanumeric characters with a single space, drop English stopwords,
    lemmatize each remaining token.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as an empty tweet.

    Returns:
        The cleaned tweet as one space-separated string, or '' when nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    # '@\w+' removes the whole handle; the previous pattern '\@w+' only matched
    # an '@' followed by literal 'w' characters, so handles were left in.
    text = _MENTION_OR_HASH_PATTERN.sub('', text)
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    tokens = text.split()  # split() also discards leading/trailing whitespace
    if not tokens:
        return ''

    # Set membership keeps stopword filtering O(1) per token; the stopword list
    # is no longer rebuilt for every word.
    stop_words, lemmatizer = _get_text_tools()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Derive the cleaned text column from the raw tweets
df['clean_text'] = df['text'].map(preprocess_text)

# The raw column is no longer needed once the cleaned one exists
del df['text']

# Write the cleaned frame to disk without the index column
cleaned_file_name = 'cleaned_dataset.csv'
df.to_csv(path_or_buf=cleaned_file_name, index=False)

print(f"Cleaned dataset saved as {cleaned_file_name}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Cleaned dataset saved as cleaned_dataset.csv


In [None]:
# Inspect the cleaned frame. The slice is up to 2000 rows long, but print()
# abbreviates long frames to their first and last rows.
print(df.head(2000))

        id keyword                      location  target  \
0        1     NaN                           NaN       1   
1        4     NaN                           NaN       1   
2        5     NaN                           NaN       1   
3        6     NaN                           NaN       1   
4        7     NaN                           NaN       1   
...    ...     ...                           ...     ...   
1995  2869  damage                         Texas       1   
1996  2870  damage  Lawrence, KS via Emporia, KS       1   
1997  2871  damage     http://twitch.tv/jcmonkey       1   
1998  2872  damage                     Indonesia       0   
1999  2873  damage                       Unknown       1   

                                             clean_text  
0            deed reason earthquake may allah forgive u  
1                 forest fire near la ronge sask canada  
2     resident asked shelter place notified officer ...  
3     13 000 people receive wildfire evacuation

Convert Text Data into Numerical Representations using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle

# Load the cleaned dataset
cleaned_file_path = 'cleaned_dataset.csv'
clean_df = pd.read_csv(cleaned_file_path)

# Handle missing values in 'clean_text' column
# Fill missing values with an empty string
# (presumably tweets whose cleaned text was empty come back from the CSV as NaN -- TODO confirm)
clean_df['clean_text'] = clean_df['clean_text'].fillna('')

# TF-IDF Vectorization, with the vocabulary capped at 5000 terms
# NOTE(review): the vectorizer is fitted on every row of the dataset here.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(clean_df['clean_text'])

# Save the fitted TF-IDF vectorizer to 'tfidf_features.pkl'
# (despite the file name, the X_tfidf feature matrix itself is not written)
with open('tfidf_features.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Assuming 'target' is the column with labels
# Replace 'target' with the actual column name if different
y = clean_df['target']

In [None]:
# Show the sparse TF-IDF matrix under a heading line
print("TF-IDF Features:", X_tfidf, sep="\n")


TF-IDF Features:
  (0, 134)	0.27527693377750867
  (0, 4018)	0.23045338752214425
  (0, 4751)	0.2527886727505137
  (0, 2224)	0.4326583059755764
  (0, 3063)	0.24966018217734523
  (0, 3351)	0.3411902865636733
  (0, 4277)	0.2798805151335151
  (0, 4450)	0.2933586974934953
  (0, 510)	0.2982507323111681
  (0, 4779)	0.26743366613562625
  (0, 944)	0.3411902865636733
  (1, 3445)	0.2855374008727737
  (1, 14)	0.24736121699873692
  (1, 2026)	0.2440775482230953
  (1, 4664)	0.19316262548458898
  (1, 1534)	0.2963940574538363
  (1, 861)	0.3072507140348989
  (1, 20)	0.22575526773324223
  (1, 16)	0.24178967295228518
  (1, 2459)	0.45651082499719753
  (1, 3974)	0.31169565548814415
  (1, 4450)	0.2832495256019636
  (1, 510)	0.2879729803799315
  (2, 734)	0.45356960385565914
  (2, 351)	0.5542411194286299
  :	:
  (11366, 987)	0.34910369064155694
  (11366, 1106)	0.3252959314000937
  (11366, 2097)	0.3252959314000937
  (11366, 1712)	0.27118283447629454
  (11366, 272)	0.19232571023645115
  (11367, 4948)	0.3298271490

 Explore and Integrate Word Embedding Techniques using Word2Vec

In [None]:
from gensim.models import Word2Vec
import numpy as np # Import numpy

# Tokenize every cleaned tweet on whitespace; Word2Vec expects lists of tokens
sentences = [tweet.split() for tweet in clean_df['clean_text']]

# Fit 100-dimensional embeddings, keeping every word (min_count=1)
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained model with gensim's own save() method
word2vec_model.save("word2vec.model")

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Reload the model persisted by the previous cell instead of training a second
# one. With workers=4 a retrained model is not guaranteed to match the saved
# one, so the statistics below would describe a model that was never stored.
word2vec_model = Word2Vec.load("word2vec.model")

# Print some information about the Word2Vec model
print("Word2Vec Model Information:")
print("Vocabulary Size:", len(word2vec_model.wv))
print("Word Vector Dimensionality:", word2vec_model.wv.vector_size)
print("Most similar words to 'disaster':", word2vec_model.wv.most_similar('disaster'))


Word2Vec Model Information:
Vocabulary Size: 15590
Word Vector Dimensionality: 100
Most similar words to 'disaster': [('day', 0.9955389499664307), ('u', 0.9954074025154114), ('like', 0.9953625798225403), ('2', 0.9950570464134216), ('4', 0.9950290322303772), ('via', 0.9949931502342224), ('3', 0.9949022531509399), ('amp', 0.9948927760124207), ('flood', 0.9946916103363037), ('5', 0.9946828484535217)]


Select and Compare Machine Learning Algorithms


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Split the cleaned text itself rather than the pre-computed X_tfidf matrix.
# X_tfidf was fitted on every row, so its vocabulary and IDF weights already
# contain information from the rows that end up in the test set.
text_train, text_test, y_train, y_test = train_test_split(
    clean_df['clean_text'], y, test_size=0.2, random_state=42
)

# Fit TF-IDF on the training tweets only, then reuse it for the test tweets
comparison_vectorizer = TfidfVectorizer(max_features=5000)
X_train = comparison_vectorizer.fit_transform(text_train)
X_test = comparison_vectorizer.transform(text_test)

# Define the classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    # Seeded so the comparison gives the same result on every run
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Multinomial Naive Bayes': MultinomialNB()
}

# Train and evaluate each classifier, remembering the most accurate one
best_classifier = None
best_accuracy = 0

for clf_name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Test Accuracy: {accuracy:.4f}")

    # Strict '>' keeps the earlier classifier when two accuracies tie
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_classifier = clf_name

print(f"\nBest Classifier: {best_classifier} with Accuracy: {best_accuracy:.4f}")


Logistic Regression Test Accuracy: 0.8148
Random Forest Test Accuracy: 0.7991
Support Vector Machine Test Accuracy: 0.8135
Multinomial Naive Bayes Test Accuracy: 0.8326

Best Classifier: Multinomial Naive Bayes with Accuracy: 0.8326


Train, Tune, and Evaluate the Random Forest Classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Load the preprocessed dataset written by the cleaning step; it is read into
# its own frame (`data`), separate from the `clean_df` used earlier.
data = pd.read_csv('cleaned_dataset.csv')  # Ensure this CSV has clean_text and target columns

In [None]:
# Separate features and target variable
# X still holds text at this point; it is vectorized only after the split below
X = data['clean_text']  # Assuming 'clean_text' column contains preprocessed tweets
y = data['target']      # Assuming 'target' column contains the labels

In [None]:

# Split the data into training and temporary set (80% train, 20% temp)
# random_state is fixed so the same rows land in each subset on every run
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split the temporary set into validation and testing sets (50% val, 50% test),
# i.e. roughly 10% of the full dataset each
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:

# Initialize TF-IDF Vectorizer (vocabulary capped at 5000 terms); this rebinds
# `tfidf_vectorizer` to a fresh, unfitted instance
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [None]:
# Handle missing values in 'clean_text' column before vectorization
X_train = X_train.fillna('')  # Replace NaN values with empty strings

# Fit and transform the training data
# (the vocabulary and IDF weights are learned from the training split only)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Validation split: blank out missing tweets, then project onto the fitted vocabulary
X_val = X_val.fillna(value='')
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Test split: same treatment, reusing the vectorizer fitted on the training data
X_test = X_test.fillna(value='')
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Initialize the model
# All hyperparameters are left at their defaults; only the random seed is fixed
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Train the model
# This fits the default-hyperparameter forest; the tuned model comes from the
# grid search in the cells that follow.
rf_model.fit(X_train_tfidf, y_train)

In [None]:

# Hyperparameter candidates for the random forest: 3 * 4 * 3 * 3 = 108 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranked by
# accuracy; n_jobs=-1 parallelizes the fits and verbose=2 logs progress
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# Fit GridSearchCV
# Runs the cross-validated fits for every combination in param_grid, using the
# training split only
grid_search.fit(X_train_tfidf, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Get the best parameters
# (the combination with the best mean cross-validated accuracy)
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')


Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [None]:

# Retrieve the model with the best parameters. No extra training happens on
# this line: with GridSearchCV's default refit=True, the best estimator was
# already refitted on the training data during grid_search.fit().
best_rf_model = grid_search.best_estimator_


In [None]:

# Predict on the validation set
# (uses the tuned estimator, not the default-parameter rf_model)
y_val_pred = best_rf_model.predict(X_val_tfidf)


In [None]:
# Validation-set metrics; precision and recall are support-weighted averages over the classes
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_precision = precision_score(y_true=y_val, y_pred=y_val_pred, average='weighted')
val_recall = recall_score(y_true=y_val, y_pred=y_val_pred, average='weighted')


In [None]:
# Report the validation metrics, one per line
print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation Precision: {val_precision}',
    f'Validation Recall: {val_recall}',
    sep='\n',
)


Validation Accuracy: 0.8028909329829172
Validation Precision: 0.8010900978613094
Validation Recall: 0.8028909329829172


In [None]:
# Detailed classification report
# (per-class precision, recall, F1 and support on the validation set)
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

           0       0.82      0.87      0.84       467
           1       0.77      0.69      0.73       294

    accuracy                           0.80       761
   macro avg       0.80      0.78      0.79       761
weighted avg       0.80      0.80      0.80       761



In [None]:

# Predict on the test set
# (this split was not used to fit the vectorizer or to run the grid search)
y_test_pred = best_rf_model.predict(X_test_tfidf)


In [None]:
# Test-set metrics; precision and recall are support-weighted averages over the classes
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_precision = precision_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
test_recall = recall_score(y_true=y_test, y_pred=y_test_pred, average='weighted')

# Report the three metrics, one per line
print(
    f'Test Accuracy: {test_accuracy}',
    f'Test Precision: {test_precision}',
    f'Test Recall: {test_recall}',
    sep='\n',
)


Test Accuracy: 0.7979002624671916
Test Precision: 0.7960952097374383
Test Recall: 0.7979002624671916


In [None]:
# Detailed classification report
# (per-class precision, recall, F1 and support on the test set)
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.82      0.86      0.84       470
           1       0.75      0.70      0.73       292

    accuracy                           0.80       762
   macro avg       0.79      0.78      0.78       762
weighted avg       0.80      0.80      0.80       762



In [None]:
# Function to classify new tweets
def classify_tweet(tweet, vectorizer=None, model=None):
    """Label a single tweet as 'Disaster' or 'Not Disaster'.

    Args:
        tweet: Tweet text. It is vectorized as-is, so it should already have
            been cleaned the same way as the training data.
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.
        model: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``best_rf_model``.

    Returns:
        'Disaster' if the predicted label is 1, otherwise 'Not Disaster'.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = best_rf_model

    # transform() expects an iterable of documents, hence the one-item list
    tweet_tfidf = vectorizer.transform([tweet])

    # predict() returns one label per input row; take the single label
    # explicitly instead of truth-testing the whole array
    prediction = model.predict(tweet_tfidf)[0]

    return 'Disaster' if prediction == 1 else 'Not Disaster'


In [None]:
import joblib
# Persist the TF-IDF vectorizer fitted on the training split
# (presumably loaded later by the Streamlit app in ui.py -- confirm)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [None]:
# Persist the tuned estimator selected by the grid search. The plain `rf_model`
# was only fitted with default hyperparameters and is not the model whose
# validation/test metrics are reported above.
joblib.dump(best_rf_model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [None]:
# Print this machine's public IPv4 address
# (presumably needed as the localtunnel access password -- confirm)
!wget -q -O - ipv4.icanhazip.com

34.145.204.161


In [None]:
# Start the Streamlit app (ui.py) in the background and open a localtunnel to port 8501
! streamlit run ui.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.145.204.161:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.773s
your url is: https://honest-pumas-cheat.loca.lt
2024-07-05 12:55:03.986 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2024-07-05 12:55:09.792 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2024-07-05 12:57:48.863 `label` got an empty value. This is discouraged for accessibility reasons and

In [None]:
pip install --upgrade streamlit



In [None]:
!pip install streamlit joblib tweepy
