In [79]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import joblib

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the datasets
train_file = 'tweets.csv'
data = pd.read_csv(train_file)

# Preprocess the data
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove punctuation and numbers
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return tokens

data['cleaned_text'] = data['text'].apply(preprocess_text)

# Word2Vec embeddings
sentences = data['cleaned_text'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(sentences, total_examples=len(sentences), epochs=10)

def get_word2vec_embeddings(tokens):
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if valid_tokens:
        embedding = np.mean([word2vec_model.wv[word] for word in valid_tokens], axis=0)
    else:
        embedding = np.zeros(100)
    return embedding

data['embeddings'] = data['cleaned_text'].apply(get_word2vec_embeddings)

X = np.vstack(data['embeddings'].values)

# Assuming 'target' is the column name for labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['target'])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model without hyperparameter tuning
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_val_pred = rf.predict(X_val)

# Print classification metrics
print("Classification Report (Validation):")
print(classification_report(y_val, y_val_pred))
print(f"Accuracy (Validation): {accuracy_score(y_val, y_val_pred)}")

# Save the model and necessary objects
model_file = 'best_model.pkl'
joblib.dump(rf, model_file)

word2vec_model_file = 'word2vec_model.pkl'
word2vec_model.save(word2vec_model_file)

label_encoder_file = 'label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_file)

# Save the cleaned tweets into a new CSV file
cleaned_tweets_file = 'cleaned_tweets.csv'
data[['text', 'cleaned_text']].to_csv(cleaned_tweets_file, index=False)

print(f"Cleaned tweets saved to {cleaned_tweets_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1866
           1       0.77      0.50      0.61       449

    accuracy                           0.87      2315
   macro avg       0.83      0.73      0.77      2315
weighted avg       0.87      0.87      0.86      2315

Accuracy (Validation): 0.8734341252699784
Cleaned tweets saved to cleaned_tweets.csv


In [80]:
from sklearn.model_selection import RandomizedSearchCV

# Define a smaller parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

# Initialize the random forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV with fewer iterations
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Perform random search
rf_random.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:")
print(rf_random.best_params_)

# Use the best estimator to predict on validation set
best_rf = rf_random.best_estimator_
y_val_pred_tuned = best_rf.predict(X_val)

# Print classification metrics after tuning
print("Classification Report after Tuning (Validation):")
print(classification_report(y_val, y_val_pred_tuned))
print(f"Accuracy after Tuning (Validation): {accuracy_score(y_val, y_val_pred_tuned)}")

# Save the best model
best_model_file = 'best_rf_model.pkl'
joblib.dump(best_rf, best_model_file)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


  warn(


Best Parameters:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20}
Classification Report after Tuning (Validation):
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1866
           1       0.76      0.50      0.60       449

    accuracy                           0.87      2315
   macro avg       0.83      0.73      0.76      2315
weighted avg       0.86      0.87      0.86      2315

Accuracy after Tuning (Validation): 0.8730021598272139


['best_rf_model.pkl']

In [89]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import joblib
from gensim.models import Word2Vec

# Function to get Word2Vec embeddings
def get_word2vec_embeddings(tokens):
    if isinstance(tokens, list):
        valid_tokens = [word for word in tokens if word in word2vec_model.wv]
        if valid_tokens:
            embedding = np.mean([word2vec_model.wv[word] for word in valid_tokens], axis=0)
        else:
            embedding = np.zeros(100)
    else:
        embedding = np.zeros(100)
    return embedding

# Load the saved Word2Vec model
word2vec_model = Word2Vec.load('word2vec_model.pkl')

# Load the saved Random Forest model
best_rf = joblib.load('best_rf_model.pkl')

# Load the saved label encoder
label_encoder = joblib.load('label_encoder.pkl')

# Load the new dataset
test_file = 'test.csv'  # Replace with your actual test file
test_data = pd.read_csv(test_file)

# Ensure that 'cleaned_text' column exists and is preprocessed
if 'cleaned_text' not in test_data.columns:
    raise ValueError("The 'cleaned_text' column is missing from the dataset.")

# Create embeddings for the preprocessed text
test_data['embeddings'] = test_data['cleaned_text'].apply(get_word2vec_embeddings)
X_test = np.vstack(test_data['embeddings'].values)

# Encode the target labels
y_test = label_encoder.transform(test_data['target'])

# Predict the target using the loaded model
y_test_pred = best_rf.predict(X_test)

# Print classification metrics
print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))
print(f"Accuracy (Test): {accuracy_score(y_test, y_test_pred)}")


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.58      1.00      0.73      4401
           1       0.00      0.00      0.00      3212

    accuracy                           0.58      7613
   macro avg       0.29      0.50      0.37      7613
weighted avg       0.33      0.58      0.42      7613

Accuracy (Test): 0.5780901090240378


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
# Test the model with a single sentence
def test_single_sentence(sentence):
    tokens = preprocess_text(sentence)
    embedding = get_word2vec_embeddings(tokens)
    embedding = embedding.reshape(1, -1)
    prediction = best_rf.predict(embedding)
    return prediction

# Example test with the provided test data
test_sentences = [
    "This is a test tweet to check the model prediction.",
    "Another example sentence for prediction testing.",
    "The rain in Spain falls mainly on the plain.",
    "the Tsunami killed thousands.",
    "the thunderstorm hit our neighbour's house",
    "a house flew in the air due to whirlwind this afternoon",
    "The first train to cross the world's highest railway bridge-the Chenab bridge in india",
    "Since it's yoga day, it's the perfect image to signify that our infrastructure is stretching itself as far towardsthe skies as possible ",
    "Massive fire has broken out near the oak street bridge in richmond of british colombia",
    "Firefighters are working tirelessly to contain a fire at a local business. Support them by avoiding the area.",
    "Severe thunderstorm warning in effect. Stay indoors and avoid travel if possible."
    "Recovery and rebuilding after the tsunami will require a global effort. Let's stand together.",
    "Tornado spotted! Take cover immediately and follow emergency instructions.",
    "Hurricane making landfall with strong winds and heavy rain. Evacuate if advised and stay indoors.",
    "A strong earthquake has just struck. Check on your loved ones and follow safety protocols.",
    "Smoky skies and blazing fires. Our thoughts are with everyone in the path of the wildfires.",
    "Inundated streets and rising waters. Thoughts are with everyone affected by the floods.",
    "Downed trees and power lines reported due to the storm. Stay away from any fallen wires.",
    "Had an amazing dinner with friends last night.",
    "Just finished a great workout at the gym!",
    "Had an amazing dinner with friends last night.",
    "Enjoying a relaxing day at the beach.",
    "Watching a new movie on Netflix tonight.",
    "Just adopted a new puppy, can't wait to bring him home!",
    "Reading a fantastic book on my day off.",
    "Exploring the city and finding new coffee shops.",
    "Attending a concert this weekend, so excited!",
    "Spent the day gardening, love being outdoors.",
    "Catching up on some much-needed sleep today."

]

for sentence in test_sentences:
    prediction = test_single_sentence(sentence)
    print(f"Prediction for the test sentence '{sentence}': {prediction}")

Prediction for the test sentence 'This is a test tweet to check the model prediction.': [0]
Prediction for the test sentence 'Another example sentence for prediction testing.': [0]
Prediction for the test sentence 'The rain in Spain falls mainly on the plain.': [0]
Prediction for the test sentence 'the Tsunami killed thousands.': [1]
Prediction for the test sentence 'the thunderstorm hit our neighbour's house': [1]
Prediction for the test sentence 'a house flew in the air due to whirlwind this afternoon': [1]
Prediction for the test sentence 'The first train to cross the world's highest railway bridge-the Chenab bridge in india': [0]
Prediction for the test sentence 'Since it's yoga day, it's the perfect image to signify that our infrastructure is stretching itself as far towardsthe skies as possible ': [0]
Prediction for the test sentence 'Massive fire has broken out near the oak street bridge in richmond of british colombia': [1]
Prediction for the test sentence 'Firefighters are wor

In [49]:

!pip install streamlit
!pip install pyngrok
!pip install joblib
!pip install numpy
!pip install nltk
!pip install gensim
!pip install requests
!pip install beautifulsoup4




In [91]:
%%writefile app.py
import streamlit as st
import joblib
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import requests
import base64
from datetime import datetime, timedelta
import tweepy  # Import Tweepy for Twitter API interaction

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the trained models and other necessary objects
model = joblib.load('best_rf_model.pkl')
word2vec_model = Word2Vec.load('word2vec_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Load and encode the background image
def get_base64_image(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

bg_image = get_base64_image('natural.png')  # Replace with actual image path

# Custom CSS
st.markdown(
    f"""
    <style>
    .stApp {{
        background-image: url("data:image/jpeg;base64,{bg_image}");
        background-size: cover;
    }}

    .stButton>button {{
        background-color: #071952;
        color: white;
        padding: 10px 24px;
        font-size: 16px;
        border: none;
        border-radius: 5px;
    }}
    .stTextInput>div>div>input {{
        padding: 5px;
        font-size: 16px;
    }}
    .disaster {{
        color: white;
        background-color: red;
        padding: 10px;
        border-radius: 5px;
    }}
    .non-disaster {{
        color: white;
        background-color: green;
        padding: 10px;
        border-radius: 5px;
    }}
    </style>
    """,
    unsafe_allow_html=True
)

# Preprocess the input text
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions and hashtags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove punctuation and numbers
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return tokens

def get_word2vec_embeddings(tokens):
    valid_tokens = [word for word in tokens if word in word2vec_model.wv]
    if valid_tokens:
        embedding = np.mean([word2vec_model.wv[word] for word in valid_tokens], axis=0)
    else:
        embedding = np.zeros(100)
    return embedding

# Function to fetch tweet text from URL
def fetch_tweet_text_from_url(url, bearer_token):
    tweet_id = url.split('/')[-1]
    tweet_url = f"https://x.com/mrpxssy/status/{tweet_id}"
    headers = {
        'Authorization': f'Bearer {bearer_token}',
    }
    try:
        response = requests.get(tweet_url, headers=headers)
        response.raise_for_status()  # Raise error for non-200 status codes
        tweet = response.json()
        return tweet['data']['text']
    except requests.exceptions.HTTPError as http_err:
        st.error(f"HTTP error occurred: {http_err}")
    except Exception as err:
        st.error(f"Error fetching content from URL: {err}")
    return None

# Function to fetch recent disaster-related tweets
def fetch_recent_disaster_tweets():
    auth = tweepy.AppAuthHandler('KAMf7yXekENs9em5Z8pkltIMB', 'K0EVVBACVJO3moAe5ILNwwxDDLMPM1UN2slRQJKCCHXFnyH7Hg')  # Replace with your actual consumer key and secret
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    tweets = []
    disaster_keywords = ['disaster', 'emergency', 'earthquake', 'flood', 'wildfire']  # Adjust as needed

    for keyword in disaster_keywords:
        for tweet in tweepy.Cursor(api.search, q=keyword, lang="en", tweet_mode="extended").items(10):
            if (datetime.now() - tweet.created_at) < timedelta(hours=4):
                tweets.append(tweet.full_text)
    return tweets

# Streamlit app
def main():
    st.markdown("<div class='main'>", unsafe_allow_html=True)
    st.title("Disaster Tweet Classifier")
    st.write("Enter a tweet or a URL leading to a tweet to predict if it's a disaster tweet or not.")

    tweet_or_url = st.text_area("Enter Tweet Text or URL")

    if st.button("Fetch & Classify"):
        if tweet_or_url.startswith("http"):
            tweet_text = fetch_tweet_text_from_url(tweet_or_url, 'AAAAAAAAAAAAAAAAAAAAAIzPugEAAAAA8kO%2FbxNWLXP6H8xJI80%2BNIWbMac%3DkKB4ROYsPOEXMltPFOiXMriagrlTcEno34VLzEKs8yfMQSHsVq')  # Replace with your actual bearer token
            if tweet_text:
                st.write(f"Tweet: {tweet_text}")
                cleaned_text = preprocess_text(tweet_text)
            else:
                st.error("Failed to fetch content from URL.")
                return
        else:
            tweet_text = tweet_or_url
            st.write(f"Tweet: {tweet_text}")
            cleaned_text = preprocess_text(tweet_or_url)

        embedding = get_word2vec_embeddings(cleaned_text).reshape(1, -1)
        prediction = model.predict(embedding)
        prediction_l = label_encoder.inverse_transform(prediction)[0]

        if prediction_l == 1:
            st.markdown(f"<h3 class='disaster'>Prediction: Disaster Tweet</h3>", unsafe_allow_html=True)
        else:
            st.markdown(f"<h3  class='non-disaster'>Prediction: Not a Disaster Tweet</h3>", unsafe_allow_html=True)

    if st.button("Fetch Recent Disaster Tweets"):
        st.subheader("Recent Disaster Tweets (last 4 hours)")
        recent_tweets = fetch_recent_disaster_tweets()
        for tweet in recent_tweets:
            st.write(tweet)
            st.write("---")

    st.markdown("</div>", unsafe_allow_html=True)

if __name__ == '__main__':
    main()


Overwriting app.py


In [92]:
from pyngrok import ngrok
import subprocess
import time

# Set up your ngrok authentication token
ngrok.set_auth_token('2ix585YsTPlv9xecr2xDI4bvL9W_7w44VF52rSSHdtB4zTKTQ')  # Replace with your actual ngrok auth token

# Kill any existing ngrok tunnels
ngrok.kill()

# Run the Streamlit app
streamlit_proc = subprocess.Popen(['streamlit', 'run', 'app.py', '--server.port', '8501'])

# Wait a few seconds for the app to start
time.sleep(20)  # Increase the sleep time if necessary

# Create a tunnel to the Streamlit port 8501
public_url = ngrok.connect(8501)
print('Public URL:', public_url)

# Keep the tunnel open
try:
    streamlit_proc.communicate()
except KeyboardInterrupt:
    streamlit_proc.terminate()
    ngrok.kill()



Public URL: NgrokTunnel: "https://fe6e-34-121-49-203.ngrok-free.app" -> "http://localhost:8501"
