In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Load the Twitter dataset into a pandas DataFrame
# df = pd.read_csv('/content/drive/MyDrive/SWM/preprocessed_dataset.csv')
# print(df.head())

# Load the preprocessed dataset
# Replace this with the actual loading of your preprocessed dataset
data = pd.read_csv('/content/drive/MyDrive/SWM/preprocessed_dataset.csv', encoding='latin-1',
                   names=['target', 'ids', 'date', 'flag', 'user', 'text'])
data = data.iloc[1:]
print(data.head())

  target         ids                          date      flag             user  \
1      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
2      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
3      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
4      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
5      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                text  
1  switchfoot   Awww thats bummer You shoulda got...  
2  upset cant update Facebook texting it might cr...  
3  Kenichan I dived many times ball Managed save ...  
4                   whole body feels itchy like fire  
5  nationwideclass no behaving all im mad here I ...  


  data = pd.read_csv('/content/drive/MyDrive/SWM/preprocessed_dataset.csv', encoding='latin-1',


In [6]:
# Get the column names
column_names = data.columns

# Print the column names
print(column_names)

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Get the number of rows using shape attribute
#Removing empty rows
data = data.dropna()
num_rows = data.shape[0]
data['target'] = data['target'].astype(int)

# Print the number of rows
print(num_rows)

1599889


In [9]:
# Sample a fraction of the dataset (e.g., 10%)
data = data.sample(frac=0.1, random_state=42)

In [10]:
# Split the dataset into training and testing sets
X = data['text']
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Convert the text data into numeric format
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [12]:
# Train the SVM model
svm = SVC(kernel='linear', C=1, random_state=42, verbose=2)
svm.fit(X_train_vect, y_train)

[LibSVM]

In [41]:
import string
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import time
import json
import numpy as np

In [42]:
nltk.download("stopwords")
stopWords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
def preprocess_text(text):
    def removeStopwords(text):
        tokens = []
        for token in text.split():
            if token.lower() not in stopWords:
                tokens.append(token.lower())
        return " ".join(tokens)

    def removeURL(text):
        url = re.compile(r"https?://\S+|www\.\S+")
        return url.sub(r"", text)

    def removeHTML(text):
        html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
        return re.sub(html, "", text)

    def removeSymbols(text):
        table = str.maketrans("", "", string.punctuation)
        return text.translate(table)

    preprocessed_text = removeStopwords(text)
    preprocessed_text = removeURL(preprocessed_text)
    preprocessed_text = removeHTML(preprocessed_text)
    preprocessed_text = removeSymbols(preprocessed_text)

    return preprocessed_text

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict_sentiment(text, model, vectorizer):
    start_time = time.time()
    
    preprocessed_text = preprocess_text(text)
    text_vect = vectorizer.transform([preprocessed_text])
    
    sentiment_prediction = model.predict(text_vect)
    raw_score = model.decision_function(text_vect)[0]
    confidence_score = sigmoid(raw_score)
    
    elapsed_time = time.time() - start_time
    
    sentiment = 'POSITIVE' if sentiment_prediction[0] == 4 else 'NEGATIVE'
    
    response = {
        'label': sentiment,
        'score': confidence_score,
        'elapsed_time': elapsed_time
    }
    
    return json.dumps(response)

In [44]:
# Test the model with a new Twitter text
test_text = "Just had the best day ever at the beach! Sun, sand, and waves. Can't wait to go back! #summerfun #beachday"
sentiment_json = predict_sentiment(test_text, svm, vectorizer)

# Print the JSON response
print(sentiment_json)

{"label": "POSITIVE", "score": 0.6136981900168118, "elapsed_time": 0.1604454517364502}


In [45]:
# Test the model with a new Twitter text
test_text = "Feeling so frustrated with this never-ending traffic. Why does it always have to be this way? #trafficwoes #ugh"
sentiment_json = predict_sentiment(test_text, svm, vectorizer)

# Print the JSON response
print(sentiment_json)

{"label": "NEGATIVE", "score": 0.06964910343834048, "elapsed_time": 0.0983426570892334}


In [46]:
# Test the model with a new Twitter text
test_text = "Just got my dream job offer! So excited to start this new chapter in my life. #dreamjob #careermove"
sentiment_json = predict_sentiment(test_text, svm, vectorizer)

# Print the JSON response
print(sentiment_json)

{"label": "POSITIVE", "score": 0.7847887549257667, "elapsed_time": 0.06306076049804688}


In [47]:
# Test the model with a new Twitter text
test_text = "The film does a good job of balancing this large cast and it’s just a fun superhero movie with a lot of heart."
sentiment_json = predict_sentiment(test_text, svm, vectorizer)

# Print the JSON response
print(sentiment_json)

{"label": "POSITIVE", "score": 0.7873562540387439, "elapsed_time": 0.053484439849853516}


In [48]:
# Test the model with a new Twitter text
test_text = "So disappointed with the customer service at this store. They were rude and unhelpful. #customerservicefail #disappointed"
sentiment_json = predict_sentiment(test_text, svm, vectorizer)

# Print the JSON response
print(sentiment_json)

{"label": "NEGATIVE", "score": 0.03726305645904526, "elapsed_time": 0.04345273971557617}


In [50]:
import pickle

# Save the trained model and vectorizer to a pickle file
with open('SVM.pkl', 'wb') as f:
    pickle.dump((svm, vectorizer), f)