In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing

# Locations of the two input CSV files read by this notebook.
TRAIN_CSV = "/kaggle/input/nlp-getting-started/train.csv"
TEST_CSV = "/kaggle/input/nlp-getting-started/test.csv"

# Load both files into DataFrames; every later cell works off `train` and `test`.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Show the first rows of the training frame, then print its column summary.
train_preview = train.head()
display(train_preview)
train.info()

In [None]:
# Show the first rows of the test frame, then print its column summary.
test_preview = test.head()
display(test_preview)
test.info()

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Character count of every text, stored on `train` because a later cell plots
# this column split by target. `.str.len()` is the vectorized string accessor:
# unlike `apply(len)`, it yields NaN for a missing entry instead of raising
# TypeError.
train['text_length'] = train['text'].str.len()

# Histogram (with a KDE overlay) of text lengths across the whole training set.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train['text_length'], bins=30, kde=True, color='blue', ax=ax)
ax.set(
    title='Distribution of Text Lengths in Training Set',
    xlabel='Text Length',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Bar chart of how many training rows fall into each value of `target`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train, x='target', ax=ax)
ax.set_title('Distribution of Target Variable')
plt.show()

In [None]:
# Fit the TF-IDF vectorizer on the training texts only; `fit` returns the
# fitted vectorizer itself, so it can be bound in one step.
# NOTE(review): the fit uses every row of train['text'], i.e. it happens before
# the train/validation split made further down in this notebook.
tfidf_vectorizer = TfidfVectorizer().fit(train['text'])

# Project both text columns into the feature space learned above.
train_text_features = tfidf_vectorizer.transform(train['text'])
test_text_features = tfidf_vectorizer.transform(test['text'])


In [None]:
# Human-readable names for the two target values, in legend order.
TARGET_LABELS = {0: 'Not Disaster', 1: 'Disaster'}

# Plot from a copy that carries the class name as its own column, so seaborn
# builds the legend itself and each label is tied to the right colour.
# Calling plt.legend(labels=[...]) after a seaborn hue plot discards seaborn's
# legend and pairs the given labels with whatever artists matplotlib finds
# first, which can mislabel or miscolour the classes.
length_by_class = train.assign(Target=train['target'].map(TARGET_LABELS))

# Stacked histogram (with KDE overlays) of text length, split by target class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=length_by_class,
    x='text_length',
    hue='Target',
    hue_order=list(TARGET_LABELS.values()),
    bins=30,
    kde=True,
    multiple='stack',
    palette='viridis',
    ax=ax,
)
ax.set(
    title='Distribution of Text Lengths by Target Class',
    xlabel='Text Length',
    ylabel='Frequency',
)
# Reposition seaborn's own legend; its title comes from the hue column name.
sns.move_legend(ax, 'upper right')
plt.show()

In [None]:
from wordcloud import WordCloud

# Concatenate every training text into one space-separated string.
text_combined = " ".join(train['text'])

# Build the word cloud from the combined text (fixed seed for a repeatable layout).
cloud_builder = WordCloud(width=800, height=400, random_state=42, background_color='white')
wordcloud = cloud_builder.generate(text_combined)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Most Common Words in Training Set')
plt.show()

In [None]:
# Encode the target column as integer class labels.
label_encoder = preprocessing.LabelEncoder()
train_target = label_encoder.fit_transform(train['target'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): train_text_features comes from a vectorizer fitted on all of
# train['text'], so rows that land in X_val also shaped the features and the
# validation score may be slightly optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    train_text_features,
    train_target,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier with default settings on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Score the held-out validation split.
val_predictions = model.predict(X_val)

# Accuracy alone can hide weak performance on one class, so F1 for the positive
# class (label 1) is reported alongside it.
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
print("Validation F1:", f1_score(y_val, val_predictions))

In [None]:
# Predict a class label for every row of the test feature matrix.
test_predictions = model.predict(X=test_text_features)

In [None]:
# Pair each test id with its predicted label and write the result to disk
# without the index column.
submission_columns = {'id': test['id'], 'target': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('s.csv', index=False)