In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from sklearn.svm import SVC
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import nltk
# Fetch the NLTK resources this notebook relies on:
#   stopwords     -> stopwords.words('english')
#   punkt         -> word_tokenize
#   vader_lexicon -> SentimentIntensityAnalyzer
#   wordnet       -> WordNetLemmatizer.lemmatize (raises LookupError without it)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('wordnet')


  from pandas.core import (
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Read the training data from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

# Text preprocessing function
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, stemmer)``, building both on first use only."""
    if not _PREPROCESS_RESOURCES:
        # Loading the stop-word list reads from the NLTK corpus, so do it once
        # instead of once per row.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean a piece of text for vectorization.

    Steps, in order: lower-case, strip URLs, replace runs of non-word
    characters with a single space, tokenize, drop English stop words and
    Porter-stem the remaining tokens.

    Args:
        text: The raw text. ``None`` and float NaN (missing values) are
            accepted; any other non-string value is converted with ``str()``.

    Returns:
        The surviving stemmed tokens joined by single spaces; ``''`` for a
        missing value.
    """
    # Missing cells arrive as None or float NaN, neither of which has .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercasing
    text = str(text).lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Replace punctuation/symbol runs with a space. Note that \W does not match
    # digits or underscores, so numbers are kept.
    text = re.sub(r'\W+', ' ', text)

    stop_words, stemmer = _get_preprocess_resources()
    # Remove stopwords, then stem what is left, in a single pass over the tokens
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

# Clean every tweet and keep the result alongside the raw text
df['cleaned_text'] = df['text'].map(preprocess_text)


In [3]:
wordnet_lemmatizer = WordNetLemmatizer()
# Built once here rather than on every call.
_LEMMA_STOP_WORDS = frozenset(stopwords.words('english'))


# NOTE(review): this redefines `preprocess_text` from the earlier cell. Once this
# cell has run, the name refers to this lemmatizing variant, not the stemming one.
def preprocess_text(text):
    """Tokenize `text`, drop English stop words and lemmatize the rest.

    Requires the NLTK WordNet corpus (``nltk.download('wordnet')``) to be
    available when called.

    Args:
        text: The string to process.

    Returns:
        The lower-cased, lemmatized tokens joined by single spaces (no
        trailing separator).
    """
    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # The stop-word list is lower-case, so compare the lower-cased token;
        # otherwise capitalised stop words such as "The" are kept. The check
        # also runs before lemmatization, because lemmatizing may alter a stop
        # word into a form that is absent from the list.
        if lowered in _LEMMA_STOP_WORDS:
            continue
        kept_tokens.append(wordnet_lemmatizer.lemmatize(lowered))
    return ' '.join(kept_tokens)

def extract_ngrams(text, n):
    """Return the contiguous n-grams of `text` as a list of tuples.

    Args:
        text: Either a string, which is split on whitespace into word tokens,
            or an iterable of already-tokenized items, which is used as-is.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``n``-tuples, one per window position, in order. Empty when
        there are fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # A bare string would otherwise be windowed character by character, so
    # split it into words first; materialise other iterables so they can be sliced.
    tokens = text.split() if isinstance(text, str) else list(text)
    # Zipping the token list against itself shifted by 0..n-1 yields each window;
    # zip stops at the shortest slice, so no partial n-grams are produced.
    return list(zip(*(tokens[offset:] for offset in range(n))))


analyser = SentimentIntensityAnalyzer()


def analyse_sentiments(text):
    """Return the 'compound' entry of the VADER polarity scores for `text`."""
    return analyser.polarity_scores(text)['compound']
    

In [4]:
# Both extractors are fitted on the same column of cleaned tweets
cleaned_corpus = df['cleaned_text']

# Bag of Words (BoW): counts over unigrams and bigrams
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_bow = bow_vectorizer.fit_transform(cleaned_corpus)

# TF-IDF: weighted features over unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(cleaned_corpus)


In [5]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score the raw (uncleaned) tweet text and store the polarity per row
df['sentiment'] = df['text'].map(get_sentiment)


In [6]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable
features, labels = X_tfidf, df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Define individual models with basic parameters
model1 = LogisticRegression(max_iter=200)
model2 = MultinomialNB()
# probability=True is required for soft voting. The probability estimates are
# produced with internal data shuffling, so seed it (same seed as the split and
# the search) to keep results repeatable between runs.
model3 = SVC(probability=True, random_state=42)


In [8]:
from sklearn.ensemble import VotingClassifier

# Ensemble of the three base models; 'soft' voting averages their probabilities
base_estimators = [('lr', model1), ('nb', model2), ('svc', model3)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')
# Candidate values for the hyperparameter search. A key of the form
# '<estimator name>__<parameter>' targets one member of the voting ensemble;
# 'voting' targets the ensemble itself.
param_grid = dict(
    lr__C=[0.01, 0.1, 1, 10],        # Logistic Regression
    svc__C=[0.1, 1, 10, 100],        # SVC
    svc__kernel=['linear', 'rbf'],   # SVC
    nb__alpha=[0.01, 0.1, 1, 10],    # Naive Bayes
    voting=['soft', 'hard'],         # soft or hard voting
)

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the ensemble: 10 sampled settings, 5-fold CV, accuracy
search_settings = dict(
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(voting_clf, param_grid, **search_settings)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy: ", random_search.best_score_)


Best parameters found:  {'voting': 'soft', 'svc__kernel': 'rbf', 'svc__C': 100, 'nb__alpha': 0.1, 'lr__C': 0.01}
Best accuracy:  0.8045977011494253


In [10]:
# Score the tuned ensemble on the hold-out split while X_test still holds those
# features; the name is reused for the test.csv features further down.
# The TF-IDF vectorizer was fitted on all rows before the split, so treat these
# numbers as mildly optimistic.
holdout_predictions = random_search.best_estimator_.predict(X_test)
print("Hold-out accuracy: ", accuracy_score(y_test, holdout_predictions))
print(classification_report(y_test, holdout_predictions))

In [11]:
import pandas as pd

# Read the test tweets from disk
test_df = pd.read_csv('test.csv')

# Preview the leading rows as a sanity check on the load
print(test_df.head(5))


   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `vectorizer` was fitted on df['cleaned_text'] (URLs stripped, stop words
# removed, Porter-stemmed). The test tweets must go through the same cleaning
# before being transformed; raw text would present un-stemmed tokens, stop words
# and URL fragments that the fitted vocabulary never saw.
#
# The training-time steps are repeated here rather than calling
# `preprocess_text`, because a later cell redefines that name with a
# lemmatizing variant.
_test_stop_words = frozenset(stopwords.words('english'))
_test_stemmer = PorterStemmer()


def _clean_like_training(text):
    """Apply the cleaning steps that produced df['cleaned_text'] to `text`."""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\W+', ' ', text)
    return ' '.join(
        _test_stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in _test_stop_words
    )


test_cleaned_text = test_df['text'].apply(_clean_like_training)

# Transform (never re-fit) with the vectorizer fitted on the training data
X_test = vectorizer.transform(test_cleaned_text)

print(f"Test set shape after vectorization: {X_test.shape}")



Test set shape after vectorization: (3263, 60991)


In [13]:
# Requires `random_search` to have been fitted in an earlier cell

# Predict a label for every row of the vectorized test set
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)

# Pair each test-row id with its predicted target
submission_df = test_df[['id']].assign(target=predictions)

# Write the submission file without the DataFrame index
submission_df.to_csv('disaster_tweets_submission.csv', index=False)
