In [None]:
pip install pandas numpy nltk scikit-learn gensim tqdm



In [None]:
pip install pandas numpy nltk




Load the Dataset

In [None]:
import pandas as pd

# Read the raw tweets from disk
file_path = 'tweet.csv'  # Replace with your file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns
print(df.head(n=5))


   id keyword        location  \
0   1  ablaze             NaN   
1   2  ablaze             NaN   
2   3  ablaze   New York City   
3   4  ablaze  Morgantown, WV   
4   5  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  


Pre-processing and Cleaning of Dataset


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet data
import nltk
# Fetch the two NLTK corpora used by the preprocessing step
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Re-read the raw tweets so this cell starts from unmodified data
file_path = 'tweet.csv'  # Replace with your file name
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing function
# Patterns are compiled once because they are applied to every tweet.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+")
# '@' followed by word characters is a mention; a bare '#' is stripped so the
# hashtag word itself is kept.
_MENTION_HASH_PATTERN = re.compile(r"@\w+|#")
_NON_ALNUM_PATTERN = re.compile(r"[^A-Za-z0-9]+")

# Lazily filled cache so the NLTK stopword list and lemmatizer are created
# once instead of once per token / once per tweet.
_NLTK_RESOURCES = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalize one tweet into a space-separated string of lemmatized tokens.

    Steps: lowercase, drop URLs, drop @mentions and '#' signs, replace every
    run of non-alphanumeric characters with a space, drop stopwords, and
    lemmatize what is left.

    Args:
        text: The raw tweet. Any non-string value (for example the NaN that
            pandas uses for a missing cell) is treated as empty.
        stop_words: Optional container of words to drop. Defaults to NLTK's
            English stopword list, loaded once and cached.
        lemmatizer: Optional object with a ``lemmatize(word)`` method.
            Defaults to a cached ``WordNetLemmatizer``.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        if 'stop_words' not in _NLTK_RESOURCES:
            # A frozenset gives constant-time membership tests; the list
            # returned by stopwords.words() would be rescanned for every token.
            _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _NLTK_RESOURCES['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _NLTK_RESOURCES:
            _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _NLTK_RESOURCES['lemmatizer']

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    # The previous pattern r'\@w+' matched '@' followed by literal 'w'
    # characters, so user handles were left in the text.
    text = _MENTION_HASH_PATTERN.sub('', text)
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    # split() without arguments also discards leading/trailing whitespace
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Clean every tweet and store the result in a new column
df['clean_text'] = [preprocess_text(raw_tweet) for raw_tweet in df['text']]

# The raw text is no longer needed once the cleaned column exists
df = df.drop(columns=['text'])

# Write the cleaned frame (without the index) for the later cells to read back
cleaned_file_name = 'cleaned_dataset.csv'
df.to_csv(path_or_buf=cleaned_file_name, index=False)

print(f"Cleaned dataset saved as {cleaned_file_name}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Cleaned dataset saved as cleaned_dataset.csv


In [None]:
# Show the first 2000 cleaned rows (pandas abbreviates the printed middle section)
print(df.iloc[:2000])

        id  keyword        location  target  \
0        1   ablaze             NaN       1   
1        2   ablaze             NaN       1   
2        3   ablaze   New York City       1   
3        4   ablaze  Morgantown, WV       1   
4        5   ablaze             NaN       0   
...    ...      ...             ...     ...   
1995  1996  wounded             NaN       1   
1996  1997  wounded             NaN       1   
1997  1998  wounded             NaN       0   
1998  1999  wounded             NaN       1   
1999  2000  wounded             NaN       1   

                                             clean_text  
0     communal violence bhainsa telangana stone pelt...  
1     telangana section 144 imposed bhainsa january ...  
2                    arsonist set car ablaze dealership  
3                    arsonist set car ablaze dealership  
4     lord jesus love brings freedom pardon fill hol...  
...                                                 ...  
1995  almost 1500 innocent ir

Convert Text Data into Numerical Representations using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle

# Load the cleaned dataset
cleaned_file_path = 'cleaned_dataset.csv'
clean_df = pd.read_csv(cleaned_file_path)

# A tweet that cleaned down to an empty string is written to CSV as an empty
# field and read back as NaN, which TfidfVectorizer cannot tokenize.
# Restore those rows to '' so vectorization does not fail on them.
clean_df['clean_text'] = clean_df['clean_text'].fillna('')

# TF-IDF Vectorization (vocabulary capped at the 5000 most frequent terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(clean_df['clean_text'])

# Persist the fitted vectorizer. Despite the file name, the pickle holds the
# vectorizer object, not the X_tfidf feature matrix.
with open('tfidf_features.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Assuming 'target' is the column with labels
# Replace 'target' with the actual column name if different
y = clean_df['target']


In [None]:
# Print a heading followed by the sparse TF-IDF matrix
print("TF-IDF Features:", X_tfidf, sep="\n")


TF-IDF Features:
  (0, 82)	0.265232073123263
  (0, 3462)	0.2364878885772171
  (0, 4633)	0.2543520651716344
  (0, 1714)	0.44808564694705205
  (0, 2150)	0.265232073123263
  (0, 2463)	0.30322981823930834
  (0, 3881)	0.2744856336932624
  (0, 4135)	0.28641554816606285
  (0, 314)	0.30322981823930834
  (0, 4674)	0.293976257669309
  (0, 767)	0.3319740027853543
  (1, 2607)	0.2994662462713562
  (1, 9)	0.23243951450258682
  (1, 1605)	0.22667299480716893
  (1, 4473)	0.19239608757586613
  (1, 1330)	0.26518933904005343
  (1, 665)	0.27353676692685114
  (1, 13)	0.252602474151674
  (1, 11)	0.24320122076733267
  (1, 1825)	0.443355616475682
  (1, 1754)	0.28429847319159696
  (1, 3399)	0.28429847319159696
  (1, 4135)	0.2583689938470919
  (1, 314)	0.27353676692685114
  (2, 1043)	0.5245743477055231
  :	:
  (2201, 819)	0.31996582234107573
  (2201, 1489)	0.2907832977632908
  (2201, 2151)	0.2380401183295643
  (2201, 4578)	0.31996582234107573
  (2201, 4432)	0.2190108741050266
  (2201, 4079)	0.2230235740663911
  

 Explore and Integrate Word Embedding Techniques using Word2Vec

In [None]:
from gensim.models import Word2Vec
import numpy as np # Import numpy

# Prepare the dataset for Word2Vec: one list of tokens per tweet.
# Rows whose cleaned text is empty are read back from CSV as NaN (a float with
# no .split()), so they are treated as empty strings here.
sentences = [row.split() for row in clean_df['clean_text'].fillna('')]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the Word2Vec model
word2vec_model.save("word2vec_model.model")

# Create Word2Vec feature vectors
def get_word2vec_vector(text, model=None):
    """Return the mean word vector of the in-vocabulary tokens of ``text``.

    Args:
        text: Whitespace-separated tokens. A non-string value (for example
            NaN for a missing cell) is treated as having no tokens.
        model: Optional object exposing a ``wv`` mapping of token -> vector
            with a ``vector_size`` attribute. Defaults to the notebook's
            ``word2vec_model``.

    Returns:
        A 1-D array of length ``wv.vector_size``. When no token is in the
        vocabulary the result is all zeros, so every row has the same shape
        and the rows can be stacked into one 2-D matrix.
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv

    tokens = text.split() if isinstance(text, str) else []
    token_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not token_vectors:
        # np.mean of an empty list is a scalar NaN (with a RuntimeWarning),
        # which would make the stacked feature matrix ragged.
        # float32 is assumed to match the trained vectors' dtype -- TODO confirm.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(token_vectors, axis=0)

# One averaged embedding per cleaned tweet, stacked into a single array
X_word2vec = np.array(list(map(get_word2vec_vector, clean_df['clean_text'])))

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Inspect the Word2Vec model trained in the previous cell.
# The model is deliberately NOT retrained here: a second training run would
# overwrite `word2vec_model`, and with several worker threads it may not
# reproduce the vectors already saved to disk and used to build X_word2vec.
print("Word2Vec Model Information:")
print("Vocabulary Size:", len(word2vec_model.wv))
print("Word Vector Dimensionality:", word2vec_model.wv.vector_size)

# most_similar raises KeyError for a word outside the vocabulary
if 'disaster' in word2vec_model.wv:
    print("Most similar words to 'disaster':", word2vec_model.wv.most_similar('disaster'))
else:
    print("'disaster' is not in the Word2Vec vocabulary")


Word2Vec Model Information:
Vocabulary Size: 7354
Word Vector Dimensionality: 100
Most similar words to 'disaster': [('foreign', 0.3486267924308777), ('5am', 0.3434354364871979), ('forc', 0.3370402753353119), ('csgogiveaway', 0.3301014006137848), ('grasp', 0.3298601508140564), ('bruh', 0.32777780294418335), ('fascination', 0.3248547911643982), ('overtopping', 0.3226398527622223), ('may', 0.32150524854660034), ('become', 0.31580862402915955)]


Select and Compare Machine Learning Algorithms


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Split the data into training and testing sets
# NOTE(review): X_tfidf was fitted on every row before this split, so the
# vocabulary and IDF weights have already seen the test tweets. The accuracies
# below are therefore optimistic; the later cells fit the vectorizer on the
# training split only.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Define the classifiers
# The random forest draws random bootstrap samples and feature subsets, so it
# is seeded; otherwise its score, and possibly the "best classifier" verdict,
# changes from run to run.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Multinomial Naive Bayes': MultinomialNB()
}

# Fit every candidate on the training split, score it on the test split,
# and remember the name and score of the best one seen so far.
best_classifier, best_accuracy = None, 0

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Test Accuracy: {accuracy:.4f}")

    # Only a strictly higher score displaces the current leader
    if not accuracy > best_accuracy:
        continue
    best_classifier, best_accuracy = clf_name, accuracy

print(f"\nBest Classifier: {best_classifier} with Accuracy: {best_accuracy:.4f}")


Logistic Regression Test Accuracy: 0.8027
Random Forest Test Accuracy: 0.8322
Support Vector Machine Test Accuracy: 0.8186
Multinomial Naive Bayes Test Accuracy: 0.8095

Best Classifier: Random Forest with Accuracy: 0.8322


Train, Tune, and Evaluate the Random Forest Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Read back the cleaned tweets written earlier; the CSV must provide the
# clean_text and target columns used below
data = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [None]:
# Separate features and target variable
# An empty cleaned tweet is read back from CSV as NaN, which TfidfVectorizer
# cannot tokenize; replace such values with '' so the row stays in the data.
X = data['clean_text'].fillna('')  # Assuming 'clean_text' column contains preprocessed tweets
y = data['target']                 # Assuming 'target' column contains the labels

In [None]:

# Keep 80% of the rows for training; the remaining 20% is a temporary holdout
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Halve the temporary holdout: one half for validation, the other for testing
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

In [None]:

# Fresh, unfitted vectorizer whose vocabulary is capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=5_000,
)

In [None]:

# Learn the vocabulary and IDF weights from the training tweets only,
# and vectorize those tweets in the same call
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

In [None]:

# Vectorize the held-out splits with the already-fitted vectorizer (no refit)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(held_out) for held_out in (X_val, X_test)
)

In [None]:
# Baseline random forest with library-default hyperparameters; the fixed
# random_state makes its fitted trees repeatable across runs.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Fit the untuned baseline forest on the TF-IDF training matrix.
rf_model.fit(X_train_tfidf, y_train)

In [None]:

# Hyperparameter values to try; every combination is evaluated
# (3 * 4 * 3 * 3 = 108 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by
# accuracy and run on all available cores with per-fit progress output
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# Run the cross-validated search over param_grid on the TF-IDF training matrix.
grid_search.fit(X_train_tfidf, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Read the best-scoring parameter combination from the fitted search and print it.
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')


Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [None]:

# Take the winning estimator from the search. No training happens on this
# line: grid_search was created without overriding `refit`, so (assuming the
# library default refit=True) best_estimator_ has already been refit on the
# whole training set with the best parameters.
best_rf_model = grid_search.best_estimator_


In [None]:

# Predict labels for the validation split with the tuned model.
y_val_pred = best_rf_model.predict(X_val_tfidf)


In [None]:
# Validation metrics; precision and recall are averaged over the classes,
# weighted by how many validation rows each class has
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_precision = precision_score(y_true=y_val, y_pred=y_val_pred, average='weighted')
val_recall = recall_score(y_true=y_val, y_pred=y_val_pred, average='weighted')


In [None]:
# Report the three validation metrics, one per line
for report_line in (
    f'Validation Accuracy: {val_accuracy}',
    f'Validation Precision: {val_precision}',
    f'Validation Recall: {val_recall}',
):
    print(report_line)


Validation Accuracy: 0.8318181818181818
Validation Precision: 0.8272045454545455
Validation Recall: 0.8318181818181818


In [None]:
# Per-class precision, recall, F1 and support on the validation split.
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

           0       0.83      0.98      0.90       171
           1       0.80      0.33      0.46        49

    accuracy                           0.83       220
   macro avg       0.82      0.65      0.68       220
weighted avg       0.83      0.83      0.80       220



In [None]:

# Predict labels for the test split with the tuned model.
y_test_pred = best_rf_model.predict(X_test_tfidf)


In [None]:
# Test metrics; precision and recall are averaged over the classes,
# weighted by how many test rows each class has
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_precision = precision_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
test_recall = recall_score(y_true=y_test, y_pred=y_test_pred, average='weighted')

# Report the three test metrics, one per line
for report_line in (
    f'Test Accuracy: {test_accuracy}',
    f'Test Precision: {test_precision}',
    f'Test Recall: {test_recall}',
):
    print(report_line)


Test Accuracy: 0.8280542986425339
Test Precision: 0.8187933634992458
Test Recall: 0.8280542986425339


In [None]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.83      0.97      0.90       172
           1       0.76      0.33      0.46        49

    accuracy                           0.83       221
   macro avg       0.80      0.65      0.68       221
weighted avg       0.82      0.83      0.80       221



In [None]:
# Function to classify new tweets
def classify_tweet(tweet, vectorizer=None, model=None):
    """Label a single tweet as 'Disaster' or 'Not Disaster'.

    Args:
        tweet: The tweet text. It is vectorized as given, so it should
            already be cleaned the same way as the training text.
        vectorizer: Optional fitted object with ``transform(list_of_str)``.
            Defaults to the notebook's ``tfidf_vectorizer``.
        model: Optional fitted object with ``predict(features)``.
            Defaults to the notebook's ``best_rf_model``.

    Returns:
        'Disaster' when the predicted label is 1, otherwise 'Not Disaster'.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = best_rf_model

    # The vectorizer expects a collection of documents, hence the 1-item list
    tweet_tfidf = vectorizer.transform([tweet])

    # predict returns one label per input row; take the single label
    # explicitly instead of comparing the whole result container to 1
    prediction = model.predict(tweet_tfidf)

    return 'Disaster' if prediction[0] == 1 else 'Not Disaster'


In [None]:
import joblib
# Persist the TF-IDF vectorizer that was fitted on the training split.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [None]:
# Persist the tuned model that the validation/test metrics above describe.
# `rf_model` is only the untuned baseline fitted before the grid search.
joblib.dump(best_rf_model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [None]:
import streamlit as st
import joblib

# Load the two artifacts saved earlier in this notebook. Both are required:
# the classifier cannot consume raw strings, and the TF-IDF vectorizer has no
# predict()/predict_proba() methods.
vectorizer = joblib.load('tfidf_vectorizer.joblib')
model = joblib.load('random_forest_model.joblib')

# NOTE(review): the model was trained on text cleaned by preprocess_text,
# while the text typed here is vectorized as entered. Consider applying the
# same cleaning before transform().

# Set up the Streamlit UI
st.title("Text Classification App")
st.write("Enter some text and click the button to classify it.")

# Create a text input box
text_input = st.text_area("Enter text here:")

# Create a button to classify the text
if st.button("Classify"):
    if not text_input.strip():
        # Nothing to classify; avoid reporting a prediction for empty input
        st.write("Please enter some text to classify.")
    else:
        # Turn the text into the feature representation the model was trained on
        features = vectorizer.transform([text_input])
        prediction = model.predict(features)
        confidence = model.predict_proba(features)

        # Label 1 means "Disaster", matching classify_tweet in this notebook
        label = 'Disaster' if prediction[0] == 1 else 'Not Disaster'
        # Highest class probability of the single input row
        confidence_score = max(confidence[0])
        st.write(f"Classification: {label}")
        st.write(f"Confidence: {confidence_score:.2f}")