In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:

# Sample data (Replace with your actual data)
# data = {
#     'sentence': ["I love this movie!", "This movie is terrible.", "The acting was great."],
#     'sentiment': ['positive', 'negative', 'positive']
# }
# df = pd.DataFrame(data)

PATH = r"tweets\train2.csv"

df = pd.read_csv(PATH)
df.head()



Unnamed: 0,selected_text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative


In [12]:
df['selected_text'] = df['selected_text'].astype(str)

In [13]:

# Tokenization, Lemmatization, Stopwords Removal, POS Tagging
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return " ".join(tokens)

df['processed_sentence'] = df['selected_text'].apply(preprocess_text)

# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_sentence'])
y = df['sentiment']



In [14]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [15]:
# Training Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)



In [16]:
# Evaluation
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.7791522648717483


In [20]:
new_test=[
    "i love pizza",
    "this wine taste's great",
    "i am going to college in afternoon",
    "i will kill you",
]
new_y_pred = nb_classifier.predict(new_test)
new_y_pred

ValueError: Expected 2D array, got 1D array instead:
array=['i love pizza' "this wine taste's great"
 'i am going to college in afternoon' 'i will kill you'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [19]:
df.to_csv("new.csv")

In [17]:
# Creating Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])


In [18]:

# Training Pipeline
pipeline.fit(df['processed_sentence'], df['sentiment'])



In [None]:
# Saving the pipeline for deployment
from joblib import dump
dump(pipeline, 'sentiment_analysis_pipeline.joblib')