In [None]:
# Install the Kaggle CLI and register the API token so the dataset can be fetched.
!pip install kaggle
!mkdir -p ~/.kaggle
# kaggle.json is copied from the current working directory
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json
# Download the kazanova/sentiment140 dataset via the Kaggle CLI
!kaggle datasets download -d kazanova/sentiment140
from zipfile import ZipFile
# Path of the downloaded dataset archive
dataset = '/content/sentiment140.zip'

# Unpack everything into the current working directory. The handle is named
# `archive` (not `zip`) so the builtin zip() is not shadowed for the rest of
# the notebook.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()
    print('Done')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import pandas as pd

# Read the raw CSV. No header row is parsed (header=None), so the column
# names are supplied explicitly.
columns = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=columns,
    header=None,
    encoding='latin1',
)

# Quick look at the first rows
print(df.head())
# Clean text: fetch the NLTK resources used by the cleaning step
import nltk

# 'stopwords' backs the stop-word list; 'punkt_tab' is the punkt_tab data package
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

# A set gives constant-time membership checks during token filtering
stop_words = set(stopwords.words('english'))
# Compiled once up front: these patterns are applied to every row of the dataset.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_PUNCT_RE = re.compile(r'[^\w\s]')


def clean_text(text):
    """Normalize a raw tweet into a space-joined string of lowercase tokens.

    Steps, in order: drop URLs, @mentions and #hashtags, strip every
    character that is neither a word character nor whitespace, lowercase,
    tokenize, and remove tokens found in ``stop_words``.

    Args:
        text: The raw tweet. Non-string values (for example the float NaN
            pandas produces for a missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string.
    """
    # Missing cells arrive as float NaN / None; the regex calls below would
    # raise TypeError on them, so map them to an empty document instead.
    if not isinstance(text, str):
        return ''

    # The removals stay sequential (not merged into one alternation) so that
    # URLs are stripped before mention/hashtag matching sees the text.
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    text = text.lower()

    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Run the cleaner over every tweet, then preview raw vs. cleaned text
df['cleaned_text'] = df['text'].map(clean_text)
print(df[['text', 'cleaned_text']].head())
import matplotlib.pyplot as plt
import seaborn as sns

# Collapse the raw target codes into binary labels: 0 -> 0, 4 -> 1.
# NOTE(review): presumably 0 = negative and 4 = positive; confirm against the
# dataset description. Any other target value would become NaN here.
df['sentiment'] = df['target'].map({0: 0, 4: 1})

# Visualization: class balance of the encoded labels
plt.figure(figsize=(6, 4))
sns.countplot(x='sentiment', data=df)
plt.title("Distribution of Encoded Sentiment Labels")
plt.show()

# The explicit mapping above already produces integer labels, so the encoder
# is never fitted; the instance is kept only so the name stays defined.
label_encoder = LabelEncoder()
# Step 4: Split into train and test sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs. Without this step X_train/X_test/y_train/y_test
# are undefined and the code below raises NameError.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Step 5: Vectorize text
# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the held-out rows leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Visualization: Shape of vectorized data
print("\nShape of Vectorized Data:")
print(f"X_train_vec shape: {X_train_vec.shape}")
print(f"X_test_vec shape: {X_test_vec.shape}")
# Step 6: Train model
# max_iter is raised above the library default (100) so the solver has room
# to converge on a large sparse TF-IDF matrix instead of stopping early.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Step 7: Evaluate model
y_pred = model.predict(X_test_vec)
# Report a headline metric on the held-out split
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

# Step 8: Save model
# Both artifacts are needed at inference time: the vectorizer turns text into
# the feature space the model was trained on.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("\nModel and vectorizer saved successfully!")

# Visualization: Confusion Matrix
# Binary task, so the tick labels are fixed to the two encoded classes.
class_ticks = [0, 1]
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Visualization: Classification Report
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=['0', '1'])
print(report)




from google.colab import files

# Hand both saved artifacts to Colab's download helper, model first
for artifact in ('sentiment_model.pkl', 'tfidf_vectorizer.pkl'):
    files.download(artifact)