In [None]:
import string

import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# word_tokenize relies on the Punkt sentence-tokenizer data. Recent NLTK
# releases look it up under 'punkt_tab' while older ones use 'punkt', so
# request both to keep the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)



In [None]:
# Read the raw disaster-tweets dataset from the CSV next to the notebook.
csv_path = 'disaster_tweets_data(DS).csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first rows of the raw data.
print(df.head())

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()
print(df)

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Split each tweet into a list of word tokens and store it in 'tokens'.
tweet_tokens = df['tweets'].map(word_tokenize)
df['tokens'] = tweet_tokens
print(df)

In [None]:
# Normalize the tweet text to lower case.
lowercased_tweets = df['tweets'].str.lower()
df['tweets'] = lowercased_tweets
print(df)


In [None]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with all ASCII punctuation characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of `text` without any character from `string.punctuation`;
        all other characters (including whitespace) are kept as they are.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every string cell in the DataFrame; non-string values
# (numbers, token lists, ...) pass through unchanged.
# Series.map is applied column by column instead of DataFrame.applymap, which
# pandas 2.1 deprecated in favour of DataFrame.map; this form runs without a
# warning on both older and newer pandas.
df = df.apply(
    lambda column: column.map(
        lambda value: remove_punctuation(value) if isinstance(value, str) else value
    )
)

print(df)

In [None]:
# Shared Porter stemmer instance used by apply_stemming.
stemmer = PorterStemmer()


def apply_stemming(text):
    """Return `text` with each whitespace-separated word replaced by its stem.

    The words are split on whitespace, stemmed one by one with the module-level
    `stemmer`, and joined back together with single spaces.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

# Apply stemming to the tweet text. The tokenizing and lower-casing cells read
# the text from the 'tweets' column, so that column is stemmed here as well
# (the previous lookup of a 'text' column did not match them).
df['stemmed_text'] = df['tweets'].apply(apply_stemming)

print(df)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(df)

# Get the feature names (i.e., the unique tokens)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the resulting DataFrame
print(df)


In [None]:
le = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [None]:
# Select x (independent variable) and y (dependent variable) â€“ class (good or bad)
X = df.drop('tweets', axis=1)
y = df['tweets']

# Split data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers, keyed by the display name used in the evaluation
# output. Insertion order is the order in which they are trained and reported.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('KNN Classification', KNeighborsClassifier()),
    ('SVM Classifier (Linear Kernel)', SVC(kernel='linear')),
    ('SVM Classifier (RBF Kernel)', SVC(kernel='rbf')),
])

In [None]:
# Train every candidate model, print its evaluation metrics, and remember its
# test accuracy so the best model can be picked without predicting again.
accuracies = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict the class for test data
    y_pred = model.predict(X_test)
    accuracies[model_name] = accuracy_score(y_test, y_pred)

    print(f"Model: {model_name}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracies[model_name])
    print("\n")

# Report the model with the best accuracy (the first one wins on a tie,
# following the insertion order of `models`).
best_model = max(accuracies, key=accuracies.get)
print(f"Best Model: {best_model}")
