In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the train/test CSVs, keeping only the columns at the listed positions.
train = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv',
    usecols=[0, 3, 4],
)
test = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv',
    usecols=[0, 3],
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Print pandas' structural summary (DataFrame.info) of the training frame.
train.info()

In [None]:
# Print pandas' structural summary (DataFrame.info) of the test frame.
test.info()

In [None]:
# Print each training column name followed by its number of missing values.
for column_name, missing_count in train.isnull().sum().items():
    print(column_name, missing_count)

In [None]:
# Print each test column name followed by its number of missing values.
for column_name, missing_count in test.isnull().sum().items():
    print(column_name, missing_count)

In [None]:
# Features are the tweet text; labels are the `target` column.
x, y = train['text'], train['target']

In [None]:
def clean_text(text):
    """Normalise a string to lowercase ASCII letters separated by single spaces.

    Steps:
      1. Drop apostrophes so contractions stay one token ("don't" -> "dont").
      2. Replace every run of characters outside a-z / A-Z (digits,
         punctuation, whitespace, non-ASCII) with a single space.
      3. Strip the spaces left at either end when the text starts or ends
         with a non-letter, then lowercase.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; an empty string if the input contains no ASCII letters.
    """
    text = re.sub(r"'", "", text)
    # Whitespace is itself outside a-zA-Z, so one pass both replaces the
    # unwanted characters and collapses repeated separators.
    text = re.sub(r"[^a-zA-Z]+", " ", text)
    return text.strip().lower()

In [None]:
# Clean the training text held in `x` and the test frame's `text` column.
x = x.apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [None]:
# `stop` stays a list (as returned by NLTK); membership is tested against a
# set built once, so each token lookup is constant-time rather than a scan
# of the whole list.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)


def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop` removed.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_lookup)


x = x.apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

In [None]:
# value_counts() orders classes by frequency, not by label. Sort by label so
# the first bar is always target 0 and the second target 1, which is what the
# fixed colours and tick labels below assume.
plt.figure(figsize=(8, 6))
train['target'].value_counts().sort_index().plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Distribution of Target Variable')
plt.xlabel('Target')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Non-Disaster', 'Disaster'], rotation=0)
plt.show()

In [None]:
# Character length of each tweet in train['text'], stored as a new column.
train['tweet_length'] = train['text'].apply(len)

# Overlay one 20-bin histogram per class.
plt.figure(figsize=(8, 6))
for target_value, colour, legend_label in (
    (0, 'skyblue', 'Non-Disaster'),
    (1, 'salmon', 'Disaster'),
):
    class_lengths = train.loc[train['target'] == target_value, 'tweet_length']
    plt.hist(class_lengths, bins=20, color=colour, alpha=0.7, label=legend_label)
plt.xlabel('Tweet Length')
plt.ylabel('Frequency')
plt.title('Distribution of Tweet Lengths')
plt.legend()
plt.show()

In [None]:
# Arrays of tweet text per class, taken from train['text'].
is_disaster = train['target'] == 1
is_non_disaster = train['target'] == 0
disaster_tweets = train.loc[is_disaster, 'text'].values
non_disaster_tweets = train.loc[is_non_disaster, 'text'].values

In [None]:
# Join all disaster tweets into one string and render its word cloud.
disaster_text = " ".join(disaster_tweets)
wordcloud_disaster = WordCloud(
    width=800,
    height=800,
    background_color='black',
).generate(disaster_text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud_disaster, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Disaster Tweets')
plt.show()

In [None]:
# Join all non-disaster tweets into one string and render its word cloud.
non_disaster_text = " ".join(non_disaster_tweets)
wordcloud_non_disaster = WordCloud(
    width=800,
    height=800,
    background_color='white',
).generate(non_disaster_text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud_non_disaster, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Non-Disaster Tweets')
plt.show()

In [None]:
# Word cloud built from all of the text held in `x`.
all_text = " ".join(x)
wordcloud = WordCloud(background_color='black', width=600, height=600).generate(all_text)

fig, ax1 = plt.subplots(nrows=1, figsize=[6, 6])
ax1.imshow(wordcloud)

In [None]:
# Hold out 20% of the rows, shuffled with a fixed seed and stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Unigram count features feeding a logistic regression.
# The 'sag' solver is stochastic, so random_state is pinned to make the
# fitted model and the scores computed from it repeatable across runs.
c = Pipeline([
    ('count', CountVectorizer(ngram_range=(1, 1))),
    ('model', LogisticRegression(C=.8, solver='sag', max_iter=1000, random_state=42)),
])
c.fit(x_train, y_train)

In [None]:
# Score of the fitted pipeline on the training split.
train_score = c.score(x_train, y_train)
train_score

In [None]:
# Score of the fitted pipeline on the held-out split.
test_score = c.score(x_test, y_test)
test_score

In [None]:
# Predict the held-out split, then print the confusion matrix and the
# per-class classification report.
y_pred = c.predict(x_test)
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

In [None]:
# Unigram TF-IDF features feeding the same logistic regression settings.
# Note: this rebinds `c`, so from here on `c` is the TF-IDF pipeline.
# The 'sag' solver is stochastic, so random_state is pinned to make the
# fitted model and the scores computed from it repeatable across runs.
c = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),
    ('model', LogisticRegression(C=.8, solver='sag', max_iter=1000, random_state=42)),
])
c.fit(x_train, y_train)

In [None]:
# Print the pipeline's score on the training split, then on the held-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(c.score(features, labels))

In [None]:
# Predict the held-out split with the current pipeline, then print the
# confusion matrix followed by the classification report.
y_pred_tfidf = c.predict(x_test)
tfidf_confusion = confusion_matrix(y_test, y_pred_tfidf)
tfidf_report = classification_report(y_test, y_pred_tfidf)
print(tfidf_confusion)
print(tfidf_report)

In [None]:
# Build the submission frame (test ids plus the current pipeline's
# predictions) and write it to submission.csv without the index column.
submission = pd.DataFrame({
    'id': test['id'],
    'target': c.predict(test['text']),
})
submission.to_csv('submission.csv', index=False)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Reuse the held-out predictions made right after each pipeline was fitted:
# `y_pred` came from the CountVectorizer pipeline and `y_pred_tfidf` from the
# TF-IDF pipeline. By this point `c` has been rebound to the TF-IDF pipeline
# and no `c_tfidf` variable exists, so predicting again here would either
# fail or report the TF-IDF results under the CountVectorizer name.
y_pred_cv = y_pred

# Generate classification reports for CountVectorizer and TF-IDF models
classification_report_cv = classification_report(y_test, y_pred_cv)
classification_report_tfidf = classification_report(y_test, y_pred_tfidf)

# Generate confusion matrices for CountVectorizer and TF-IDF models
confusion_matrix_cv = confusion_matrix(y_test, y_pred_cv)
confusion_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)


def plot_confusion_matrix(matrix, title):
    """Draw a confusion matrix as an annotated heatmap using matplotlib only.

    Parameters
    ----------
    matrix : 2-D integer array
        Confusion matrix with true labels on rows and predicted labels on columns.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(matrix, cmap='Blues')
    # Write each count inside its cell; use white text on the darker cells
    # so the number stays readable.
    threshold = matrix.max() / 2
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            ax.text(
                col,
                row,
                format(matrix[row, col], 'd'),
                ha='center',
                va='center',
                color='white' if matrix[row, col] > threshold else 'black',
            )
    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_title(title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()


# Plotting the performance metrics for CountVectorizer model
plot_confusion_matrix(confusion_matrix_cv, 'Confusion Matrix for CountVectorizer Model')

# Plotting the performance metrics for TF-IDF model
plot_confusion_matrix(confusion_matrix_tfidf, 'Confusion Matrix for TF-IDF Model')