# Importing Libraries

In [1]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC

# Download required NLTK resources

In [2]:
# NOTE(review): `warnings` is imported here rather than in the import cell at the top of the notebook.
import warnings
# Install a catch-all filter with the 'default' action, so warnings are shown rather than suppressed.
warnings.filterwarnings('default')
# Fetch the NLTK data packages. 'stopwords' backs stopwords.words('english') in preprocess_text.
# NOTE(review): no visible cell calls an NLTK tokenizer, so the 'punkt' download may be unnecessary — confirm.
nltk.download('punkt')
# This is the cell's last expression, so its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vraje\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vraje\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the tweet dataset from a CSV path given relative to the working directory.
# Later cells read its 'clean_text' and 'category' columns.
twit_data = pd.read_csv('Twitter_Data.csv')

In [4]:
# Drop rows containing missing values (axis=0 selects rows).
# NOTE: this rebinds `twit_data`, so the unfiltered frame is no longer reachable after this cell.
twit_data = twit_data.dropna(axis=0)

In [5]:
# Show the distinct values of the 'category' column; preprocess_text later maps them to label strings.
twit_data['category'].unique()

array([-1.,  0.,  1.])

# Preprocessing function

In [6]:
# Resources shared by every preprocess_text call; built lazily on first use so
# the stop-word corpus is read once instead of once per document.
_STOP_WORDS = None
_STEMMER = None

# Numeric category code -> sentiment label string.
_LABEL_NAMES = {1.0: "positive", 0.0: "neutral", -1.0: "negative"}


def _preprocessing_resources():
    """Return the (stop-word set, stemmer) pair, creating it on the first call."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text, label):
    """Normalise one document and translate its numeric label to a string.

    The text is lowercased, stripped of every character that is neither a
    word character nor whitespace, filtered against the English stop-word
    list, and stemmed with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw document text.
    label : float, int or None
        Sentiment code: 1 (positive), 0 (neutral) or -1 (negative).
        Pass ``None`` when no label is known, e.g. for unseen text.

    Returns
    -------
    tuple
        ``(cleaned_text, label_name)``. ``label_name`` is ``"positive"``,
        ``"neutral"`` or ``"negative"``, or ``None`` when ``label`` is ``None``.

    Raises
    ------
    ValueError
        If ``label`` is neither ``None`` nor one of the three known codes
        (this includes NaN), instead of silently treating it as negative.
    """
    stop_words, stemmer = _preprocessing_resources()

    # Lowercase first, then drop punctuation (anything that is not \w or \s).
    # NOTE(review): punctuation is removed before stop-word filtering, so a
    # contraction such as "don't" is compared as "dont" — confirm that the
    # stop-word list is expected to match in that form.
    text = re.sub(r'[^\w\s]', '', text.lower())

    # Filter stop words and stem in a single pass over the tokens.
    stemmed_words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    text = ' '.join(stemmed_words)

    # No label supplied: return the cleaned text without inventing a sentiment.
    if label is None:
        return text, None

    try:
        label_name = _LABEL_NAMES[label]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unrecognised sentiment label {label!r}; expected 1, 0, -1 or None"
        ) from None

    return text, label_name

# Function to train and test the model

In [7]:
def train_test_model(data, labels, test_size=0.2, random_state=42):
    """Preprocess the corpus, fit a TF-IDF + LinearSVC classifier and score it.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    labels : iterable
        Sentiment codes paired positionally with ``data``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.2, the value this
        function previously hard-coded.
    random_state : int or None, optional
        Seed used for the train/test split and for the LinearSVC solver, so
        repeated runs give the same model; defaults to 42.

    Returns
    -------
    tuple
        ``(model, vectorizer, accuracy, precision, recall, f1)``. Precision,
        recall and F1 use weighted averaging, and every score is computed on
        the held-out test split.

    Raises
    ------
    ValueError
        If ``data``/``labels`` yield no pairs.
    """
    # Preprocess every (text, label) pair.
    preprocessed_data = [preprocess_text(text, label) for text, label in zip(data, labels)]
    if not preprocessed_data:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError("train_test_model requires at least one (text, label) pair")
    preprocessed_texts, preprocessed_labels = zip(*preprocessed_data)

    # Split into training and testing sets.
    train_texts, test_texts, y_train, y_test = train_test_split(
        preprocessed_texts,
        preprocessed_labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit the vocabulary/IDF weights on the training texts only, then reuse
    # them for the test texts so no test information leaks into the features.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Seed the solver as well so the fitted model is reproducible.
    model = LinearSVC(random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return model, vectorizer, accuracy, precision, recall, f1

# Function to test a new piece of text

In [8]:
def test_new_text(text, model, vectorizer):
    # Preprocess the text
    preprocessed_text, _ = preprocess_text(text, None)
    
    # Vectorize the preprocessed text
    text_vector = vectorizer.transform([preprocessed_text])
    
    # Predict the sentiment
    sentiment = model.predict(text_vector)[0]
    
    return sentiment

# Train and test the model

In [9]:
# Fit and score the classifier on the 'clean_text' column with 'category' as the labels.
# Preprocessing of every row happens inside this call.
model, vectorizer, accuracy, precision, recall, f1 = train_test_model(twit_data['clean_text'], twit_data['category'])

# Print the evaluation metrics

In [10]:
# Report each score as a percentage rounded to two decimal places.
print("Model Evaluation:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, round(metric_value*100, 2), "%")

Model Evaluation:
Accuracy: 84.82 %
Precision: 84.86 %
Recall: 84.82 %
F1-score: 84.74 %


# Test a new piece of text and print the sentiment result

In [11]:
# Classify one unseen sentence with the model and vectorizer produced by the training cell.
new_text = "I'm really happy with the service."
sentiment = test_new_text(new_text, model, vectorizer)

# `sentiment` is the model's predicted label for `new_text`.
print("Sentiment:", sentiment)

Sentiment: positive
