In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [5]:
# Read the three CSV inputs (training set, test set, submission template)
# from the current working directory.
train_df, test_df, sample_submission = (
    pd.read_csv('train.csv'),
    pd.read_csv('test.csv'),
    pd.read_csv('sample_submission.csv'),
)

In [6]:
# Peek at the first five training rows to see which columns are available
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Model input is the raw 'text' column; the label to predict is 'target'
X, y = train_df['text'], train_df['target']

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps
# the split identical across re-runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Two-stage pipeline: TF-IDF turns the text into features, then a
# logistic-regression classifier is trained on them. Both stages use
# their library default settings.
tfidf_step = ('tfidf', TfidfVectorizer())
classifier_step = ('classifier', LogisticRegression())
pipeline = Pipeline([tfidf_step, classifier_step])

In [10]:
# Train every pipeline stage on the training split only
# (the validation split stays unseen until evaluation).
pipeline.fit(X=X_train, y=y_train)

In [21]:
# Summarise the pipeline as a table: one row per (step name, step object) pair
pipeline_names = [name for name, _ in pipeline.steps]
pipeline_steps = [estimator for _, estimator in pipeline.steps]
pipeline_df = pd.DataFrame({'Pipeline': pipeline_names, 'Transformer/Estimator': pipeline_steps})
print("Pipeline:", pipeline_df, sep="\n")

Pipeline:
     Pipeline Transformer/Estimator
0       tfidf     TfidfVectorizer()
1  classifier  LogisticRegression()


In [12]:
# Predict labels for the held-out validation texts; the metrics cells
# below compare these predictions against y_val.
y_val_pred = pipeline.predict(X=X_val)

In [13]:
# Per-class precision / recall / F1 on the validation split, with
# readable class names instead of the raw 0/1 labels.
report = classification_report(
    y_val,
    y_val_pred,
    target_names=['Not Disaster', 'Disaster'],
)
print(report)

              precision    recall  f1-score   support

Not Disaster       0.80      0.86      0.83       874
    Disaster       0.79      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



In [14]:
# Confusion matrix on the validation split, wrapped in a DataFrame so both
# axes carry class names (scikit-learn convention: rows are true classes,
# columns are predicted classes).
cm = confusion_matrix(y_val, y_val_pred)
class_labels = ['Not Disaster', 'Disaster']
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print("Confusion Matrix:", cm_df, sep="\n")

Confusion Matrix:
              Not Disaster  Disaster
Not Disaster           750       124
Disaster               187       462


In [15]:
# Calculate precision, recall, F1-score, and support
def _safe_ratio(numerator, denominator):
    """Divide element-wise, yielding 0.0 wherever the denominator is zero.

    A plain ``a / b`` on NumPy arrays emits a RuntimeWarning and produces
    NaN/inf when ``b`` contains zeros (e.g. a class that is never predicted).
    Here those undefined entries are reported as 0.0 instead.

    Parameters
    ----------
    numerator, denominator : array-like of the same shape

    Returns
    -------
    numpy.ndarray of float, same shape as the inputs.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),  # value used where the division is skipped
        where=denominator != 0,
    )


# Diagonal entries of the confusion matrix are the correctly classified counts.
true_positives = cm.diagonal()
# Row sums = number of true samples per class.
support = cm.sum(axis=1)

# Column sums = number of predictions per class (precision denominator).
precision = _safe_ratio(true_positives, cm.sum(axis=0))
recall = _safe_ratio(true_positives, support)
# Harmonic mean of precision and recall; 0.0 when both are zero.
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

In [16]:
# Collect the per-class metrics into one table, indexed by class name
results_df = pd.DataFrame(
    {'Precision': precision, 'Recall': recall, 'F1-score': f1_score, 'Support': support},
    index=pd.Index(['Not Disaster', 'Disaster'], name='Class'),
)
print("Results:")
print(results_df)

Results:
              Precision    Recall  F1-score  Support
Class                                               
Not Disaster   0.800427  0.858124  0.828272      874
Disaster       0.788396  0.711864  0.748178      649


In [17]:
# Run the fitted pipeline over the test texts to get the predicted labels
X_test = test_df['text']
y_test_pred = pipeline.predict(X=X_test)

In [18]:
# Build the submission from the template: `assign` returns a copy with the
# 'target' column set to the test predictions, so sample_submission itself
# is left unmodified.
# NOTE(review): assumes the template rows are in the same order as test_df — confirm.
submission = sample_submission.assign(target=y_test_pred)

In [19]:
# Show the first rows of the submission as a sanity check
print("Submission:", submission.head(), sep="\n")

Submission:
   id  target
0   0       1
1   2       0
2   3       1
3   9       0
4  11       1
