In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK stopwords if you haven't already
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load the dataset
df = pd.read_csv('usuario_16052.csv')

# View the first few rows of the dataset
print(df.head())

# Text preprocessing function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove hashtags, mentions, and special characters
    text = re.sub(r'@\w+|#\w+|[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization and remove stopwords
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing to the 'Tweet' column
df['Cleaned_Tweet'] = df['Tweet'].apply(preprocess_text)

# Check the cleaned data
print(df[['Tweet', 'Cleaned_Tweet']].head())

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Use 5000 most frequent words
X = tfidf_vectorizer.fit_transform(df['Cleaned_Tweet'])

# Labels (dependent variable) - "Cause" column
y = df['Cause']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package stopwords to C:\Users\Ayushi
[nltk_data]     thakur\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


        class             tweet_id         day      time  \
0  DEPRESSION  1552203241337012224  2022-07-27  08:04:25   
1  DEPRESSION  1552202927481454592  2022-07-27  08:03:11   
2  DEPRESSION  1552200680470765573  2022-07-27  07:54:15   
3  DEPRESSION  1552200578574430208  2022-07-27  07:53:51   
4  DEPRESSION  1552198421922160640  2022-07-27  07:45:16   

                                               tweet  tweet_favorite_count  \
0    "WHAT-- 😱 yes please. Make this happen HTTPURL"                     0   
1  "@USER Then it's okay. We can hope that they w...                     1   
2                         "@USER Embarrassed as in?"                     0   
3  "Hello @USER @USER  I've been following @USER ...                     0   
4  "@USER @USER @USER @USER PLEASE I WILL DO ANYT...                     5   

   tweet_retweet_count         tweet_source              user_id  \
0                    0  Twitter for Android  1169633033957150721   
1                    0  Twitter fo

KeyError: 'Tweet'

In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Load your dataset (assuming it's in CSV format)
df = pd.read_csv('usuario_16035.csv')

# Display the first few rows of the dataset to ensure it's loaded correctly


# Preprocessing function to clean the text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing to the 'tweet' column
df['Cleaned_Tweet'] = df['tweet'].apply(preprocess_text)

# Check the cleaned data
print(df[['tweet', 'Cleaned_Tweet']].head())

# TF-IDF Vectorization of Cleaned Tweet
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['Cleaned_Tweet']).toarray()

# Assuming 'class' column is the target label
y = df['class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use Naive Bayes for classification
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Optional: Save the vectorizer and model for future use (e.g., in a Flask app)
import pickle
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)


[nltk_data] Downloading package stopwords to C:\Users\Ayushi
[nltk_data]     thakur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ayushi
[nltk_data]     thakur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'usuario_16035.csv'