In [None]:
import re

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# NLTK resources needed by data_processing: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is requested as well because newer NLTK releases appear to look
# for it instead of 'punkt' -- NOTE(review): confirm against the pinned version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the training data; later cells read its 'text' and 'target' columns.
df = pd.read_csv('train.csv')

# Define a function for data processing (noise removal, stop-word filtering, and lemmatization)
def data_processing(text):
    """Normalize one raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip @mentions and the '#'
    marker (the hashtag word itself is kept); drop remaining punctuation;
    tokenize; discard English stop words; lemmatize what is left.

    Args:
        text: Raw text of a single document (str).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    text = text.lower()
    # Match http:// and https:// links as well as bare www. links.
    text = re.sub(r"https?://\S+|www\.\S+", '', text, flags=re.MULTILINE)
    # Remove whole @mentions; for hashtags only the '#' character is removed.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)

    # Build the stop-word set once per call instead of re-reading the list
    # for every token; set membership is also a constant-time lookup.
    stop_words = set(stopwords.words('english'))

    # Define a lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Apply lemmatization and remove stop words
    processed_text = [lemmatizer.lemmatize(word) for word in text_tokens if word not in stop_words]

    return " ".join(processed_text)

# Clean every row once and keep the result next to the raw text
df['processed_text'] = df['text'].apply(data_processing)

# Features are the cleaned (lemmatized) text; labels come from 'target'
X, y = df['processed_text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fetch the VADER lexicon and the WordNet corpus
for _package in ('vader_lexicon', 'wordnet'):
    nltk.download(_package)

# Notebook-level VADER analyzer; get_sentiment reads this name
analyzer = SentimentIntensityAnalyzer()

# Sentiment helper built on the notebook-level VADER analyzer
def get_sentiment(text):
    """Return the VADER compound polarity score for ``text``.

    Delegates to the notebook-level ``analyzer`` and keeps only the
    'compound' entry of the score dictionary it returns.
    """
    return analyzer.polarity_scores(text)['compound']

# Function to train and evaluate the LSTM model
def train_and_evaluate_lstm(predefined_disasters=None):
    """Train an LSTM classifier on the notebook-level ``df`` and print its test accuracy.

    Side effects:
        * (Re)writes the 'processed_text' and 'sentiment' columns of the
          global ``df``. The sentiment column is stored but not fed to the model.
        * Pickles the fitted tokenizer to 'tokenizer.pkl'.

    Args:
        predefined_disasters: Currently unused by this function.

    Returns:
        tuple: ``(lstm_model, tokenizer)`` -- the trained Keras model and the
        fitted Tokenizer, so later cells can save or reuse them.
    """
    vocab_size = 10000  # shared by the Tokenizer and the Embedding layer
    max_len = 50        # sequence length after padding/truncation

    # Apply data processing and sentiment analysis to the dataset
    df['processed_text'] = df['text'].apply(data_processing)
    df['sentiment'] = df['text'].apply(get_sentiment)

    # Split the data into training and testing sets
    X = df['processed_text']
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define and compile the LSTM model
    lstm_model = keras.Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
        LSTM(128),
        Dense(1, activation='sigmoid')
    ])
    lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit the tokenizer on the training split only, then encode both splits
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

    # Train the LSTM model
    lstm_model.fit(X_train_padded, y_train, epochs=5, batch_size=64, verbose=1)

    # Save the tokenizer with the word index
    tokenizer_path = 'tokenizer.pkl'
    joblib.dump(tokenizer, tokenizer_path)

    # evaluate() returns [loss, accuracy] because 'accuracy' is the only metric
    loss, accuracy = lstm_model.evaluate(X_test_padded, y_test, verbose=0)
    print("LSTM Model Accuracy:", accuracy)

    return lstm_model, tokenizer


# Keep the trained model and tokenizer at notebook scope so the cell that
# calls lstm_model.save(...) works after Restart & Run All.
lstm_model, tokenizer = train_and_evaluate_lstm()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Model Accuracy: 0.7695338129997253


In [None]:
# Save the trained LSTM model to a file
# NOTE(review): expects a notebook-level `lstm_model`; confirm the training
# cell exposes one before running this on a fresh kernel.
lstm_model.save('lstm_model.h5')


  saving_api.save_model(


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the lexicon used by the VADER sentiment analyzer
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
# NOTE(review): this rebinds the notebook-level `analyzer` that an earlier cell already creates.
analyzer = SentimentIntensityAnalyzer()

# Sentiment helper (same definition as the one in the LSTM cell)
def get_sentiment(text):
    """Look up the VADER scores for ``text`` and return only the 'compound' value."""
    scores = analyzer.polarity_scores(text)
    compound = scores['compound']
    return compound

# Function to train and evaluate a Naive Bayes model with predefined disaster keywords
def train_and_evaluate_naive_bayes(df, predefined_disasters=None):
    """Fit a TF-IDF + Multinomial Naive Bayes pipeline and print its test metrics.

    The model input is the raw 'text' column. When ``predefined_disasters`` is
    given, any of those keywords found in the cleaned text are appended to the
    row's text.

    Side effects:
        Adds or overwrites the 'processed_text' and 'sentiment' columns of
        ``df``. The 'text' column is left untouched, so calling this function
        repeatedly (or before another training helper) gives the same input.

    Args:
        df: DataFrame with 'text' and 'target' columns.
        predefined_disasters: Optional iterable of keyword strings.

    Returns:
        The fitted scikit-learn Pipeline (vectorizer + classifier). Its
        ``predict`` accepts an iterable of raw strings.
    """
    # Apply data processing and sentiment analysis to the dataset
    df['processed_text'] = df['text'].apply(data_processing)
    df['sentiment'] = df['text'].apply(get_sentiment)

    # Build the model input as a separate Series. Writing it back to
    # df['text'] would append the keywords again on every later call.
    X = df['text']
    if predefined_disasters:
        disaster_terms = set(predefined_disasters)
        matched_terms = df['processed_text'].apply(
            lambda text: ' '.join(word for word in text.split() if word in disaster_terms)
        )
        X = X + ' ' + matched_terms

    # Split the data into training and testing sets
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Bundle the vectorizer with the classifier so the saved artifact can be
    # applied to raw strings without a separately persisted vectorizer.
    nb_model = make_pipeline(
        TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2)),
        MultinomialNB(),
    )
    nb_model.fit(X_train, y_train)

    # Make predictions
    nb_pred = nb_model.predict(X_test)

    # Evaluate the Naive Bayes model
    accuracy_nb = accuracy_score(y_test, nb_pred)
    classification_rep_nb = classification_report(y_test, nb_pred)

    print("Naive Bayes Model Accuracy:", accuracy_nb)
    print("Naive Bayes Model Classification Report:\n", classification_rep_nb)

    return nb_model


# Train the Naive Bayes model and keep it at notebook scope for the save cell
nb_model = train_and_evaluate_naive_bayes(df, predefined_disasters=["earthquake", "flood", "fire", "hurricane"])


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Naive Bayes Model Accuracy: 0.7964543663821405
Naive Bayes Model Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84       874
           1       0.84      0.65      0.73       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the lexicon used by the VADER sentiment analyzer
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
# NOTE(review): this rebinds the notebook-level `analyzer` that earlier cells already create.
analyzer = SentimentIntensityAnalyzer()

# Sentiment helper (same definition as in the earlier cells)
def get_sentiment(text):
    """Score ``text`` with the notebook-level VADER analyzer; return its 'compound' value."""
    polarity = analyzer.polarity_scores(text)
    return polarity['compound']

# Function to train and evaluate an SVM model with predefined disaster keywords
def train_and_evaluate_svm(df, predefined_disasters=None):
    """Fit a TF-IDF + linear-kernel SVC pipeline and print its test metrics.

    The model input is the raw 'text' column. When ``predefined_disasters`` is
    given, any of those keywords found in the cleaned text are appended to the
    row's text.

    Side effects:
        Adds or overwrites the 'processed_text' and 'sentiment' columns of
        ``df``. The 'text' column is left untouched, so the input does not
        depend on how many training helpers ran before this one.

    Args:
        df: DataFrame with 'text' and 'target' columns.
        predefined_disasters: Optional iterable of keyword strings.

    Returns:
        The fitted scikit-learn Pipeline (vectorizer + classifier). Its
        ``predict`` accepts an iterable of raw strings.
    """
    # Apply data processing and sentiment analysis to the dataset
    df['processed_text'] = df['text'].apply(data_processing)
    df['sentiment'] = df['text'].apply(get_sentiment)

    # Build the model input as a separate Series. Writing it back to
    # df['text'] would stack another copy of the keywords on every call.
    X = df['text']
    if predefined_disasters:
        disaster_terms = set(predefined_disasters)
        matched_terms = df['processed_text'].apply(
            lambda text: ' '.join(word for word in text.split() if word in disaster_terms)
        )
        X = X + ' ' + matched_terms

    # Split the data into training and testing sets
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Bundle the vectorizer with the classifier so the saved artifact can be
    # applied to raw strings without a separately persisted vectorizer.
    svm_model = make_pipeline(
        TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2)),
        SVC(kernel='linear'),
    )
    svm_model.fit(X_train, y_train)

    # Make predictions
    svm_pred = svm_model.predict(X_test)

    # Evaluate the SVM model
    accuracy_svm = accuracy_score(y_test, svm_pred)
    classification_rep_svm = classification_report(y_test, svm_pred)

    print("SVM Model Accuracy:", accuracy_svm)
    print("SVM Model Classification Report:\n", classification_rep_svm)

    return svm_model


# Train the SVM model and keep it at notebook scope for the save cell
svm_model = train_and_evaluate_svm(df, predefined_disasters=["earthquake", "flood", "fire", "hurricane"])


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


SVM Model Accuracy: 0.8049901510177282
SVM Model Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.87      0.84       874
           1       0.80      0.72      0.76       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.80      1523
weighted avg       0.80      0.80      0.80      1523



In [None]:
import joblib

# Persist both classical models; the inference cell loads these two files.
# NOTE(review): relies on notebook-level `nb_model` and `svm_model`; confirm
# the training cells expose them before running this on a fresh kernel.
joblib.dump(value=nb_model, filename='naive_bayes_model.pkl')
joblib.dump(value=svm_model, filename='svm_model.pkl')


['svm_model.pkl']

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import joblib
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Restore the artifacts written by earlier cells.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
tokenizer_path = 'tokenizer.pkl'
tokenizer = joblib.load(tokenizer_path)

# Trained models: the Keras LSTM plus the two joblib-pickled classifiers
lstm_model = load_model('lstm_model.h5')
naive_bayes_model = joblib.load('naive_bayes_model.pkl')
svm_model = joblib.load('svm_model.pkl')

# Preprocess input comment
# NOTE(review): keep this in sync with the data_processing used at training time.
def data_processing(text):
    """Normalize one raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip @mentions and the '#'
    marker (the hashtag word itself is kept); drop remaining punctuation;
    tokenize; discard English stop words; lemmatize what is left.

    Args:
        text: Raw text of a single document (str).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    text = text.lower()
    # Match http:// and https:// links as well as bare www. links.
    text = re.sub(r"https?://\S+|www\.\S+", '', text, flags=re.MULTILINE)
    # Remove whole @mentions; for hashtags only the '#' character is removed.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)

    # Build the stop-word set once per call instead of re-reading the list
    # for every token; set membership is also a constant-time lookup.
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()

    processed_text = [lemmatizer.lemmatize(word) for word in text_tokens if word not in stop_words]

    return " ".join(processed_text)

# Example input, cleaned with the data_processing function defined above
input_comment = "aeroplane turbulence"
preprocessed_input = data_processing(input_comment)

# Use each model to make individual predictions
# LSTM path: encode the cleaned text as integer ids and pad to 50, the same
# maxlen the training cell uses.
input_seq = tokenizer.texts_to_sequences([preprocessed_input])
input_padded = pad_sequences(input_seq, maxlen=50)

# The LSTM returns a sigmoid output array; the other two return class labels.
# NOTE(review): the classical models are handed a raw string here, which only
# works if the pickled estimators embed their own text vectorizer (e.g. a
# scikit-learn Pipeline) -- confirm how the .pkl files were produced.
lstm_prediction = lstm_model.predict(input_padded)
naive_bayes_prediction = naive_bayes_model.predict([preprocessed_input])[0]
svm_prediction = svm_model.predict([preprocessed_input])[0]

# Display individual predictions
print("LSTM Model Prediction:", lstm_prediction)
print("Naive Bayes Model Prediction:", naive_bayes_prediction)
print("SVM Model Prediction:", svm_prediction)


LSTM Model Prediction: [[0.17883675]]
Naive Bayes Model Prediction: 0
SVM Model Prediction: 0
