<a href="https://colab.research.google.com/github/SrinidhiNagaraju15/My-project/blob/main/Ai_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report
import string

# Step 2: Download necessary NLTK resources quietly.
# "punkt_tab" is listed here as well so this cell is self-contained on a fresh kernel
# (word_tokenize needs it on recent NLTK releases); "punkt" is kept for older ones.
nltk_data_packages = [
    "punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4", "averaged_perceptron_tagger"
]
for package in nltk_data_packages:
    nltk.download(package, quiet=True)  # No progress messages, no prompts

# Step 3: Load dataset
# Single place to change the dataset location (the default is the Colab upload folder).
DATA_PATH = '/content/DatasetReviewsAndSentiments.csv'
df = pd.read_csv(DATA_PATH)

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.discard('not')  # Keep 'not' so negation survives stop-word removal

# Step 4: Preprocessing function (FIXED)
def PreProcessText(review):
    """Normalize one review into a space-separated string of lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized.

    Args:
        review: Raw review text. Anything that is not a non-blank ``str``
            (e.g. NaN from a missing CSV cell) is treated as empty.

    Returns:
        The cleaned review as a single string, or ``""`` for missing/blank input.
    """
    if not isinstance(review, str) or not review.strip():  # Handle missing/NaN values
        return ""

    cleaned_tokens = []
    for token in word_tokenize(review.lower()):
        # The tokenizer splits contractions such as "doesn't" into "does" + "n't".
        # "n't" is not alphanumeric and would be filtered out below, silently
        # erasing the negation, so rewrite it to the "not" token that the
        # stop-word setup deliberately keeps.
        if token == "n't":
            token = "not"
        if token.isalnum() and token not in stop_words:
            cleaned_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(cleaned_tokens)

# Apply preprocessing
df['Review'] = df['Review'].apply(PreProcessText)

y = df['Label']
print(y.value_counts())

# Step 5: Split the *text* before vectorizing, so the held-out reviews cannot
# influence the vocabulary or the IDF weights (fitting TF-IDF on the full corpus
# leaks test information into training).
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df['Review'], y, test_size=0.33, random_state=42, stratify=y
)

# Step 6: TF-IDF Vectorization (fit on the training reviews only)
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
# Full-corpus matrix in the train-fitted feature space, kept so any later code
# that refers to `X` continues to work.
X = vectorizer.transform(df['Review'])

# Step 7: Model training with GridSearchCV
# NOTE: the vectorizer is fitted once on the whole training split, so the
# cross-validation folds below share one vocabulary.
model = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 0.7], 'fit_prior': [True, False]}
stratified_kfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(model, param_grid, cv=stratified_kfold, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

# Step 8: Model evaluation on the held-out split
print(f"Best parameters: {grid_search.best_params_}")
y_pred = grid_search.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f"Confusion Matrix:\n{cm}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 9: Prediction function
def predict_sentiment(review):
    """Return the label the tuned classifier predicts for one raw review string."""
    # Clean the text with PreProcessText, then wrap it in a list because
    # transform() expects an iterable of documents.
    features = vectorizer.transform([PreProcessText(review)])
    # predict() returns one label per document; there is exactly one here.
    return grid_search.predict(features)[0]

# Step 10: Sample reviews for testing
reviews = [
    "This app is amazing! I've never had any issues and it works perfectly.",
    "Scam! They charged me without my consent and the app doesn't even work.",
    "Great app for tracking my workouts. Highly recommend it to anyone!",
    "Totally fake! This app is just a clone of another one and it crashes all the time.",
    "I've been using this app for months and it's been very reliable and useful.",
    "Warning! This app stole my personal information. Do not download.",
    "Fantastic user interface and very helpful customer support.",
    "Terrible. It's filled with ads and doesn't do what it promises.",
    "Love the new features in the latest update. Well done!",
    "Fake reviews everywhere. This app is a complete fraud."
]


def _fraud_verdict(sentiment):
    """Return the verdict line for a predicted label: "negative" is reported as fraud."""
    return "App is Fraud" if sentiment == "negative" else "App is Not Fraud"


for review in reviews:
    predicted_sentiment = predict_sentiment(review)
    print(f"Review: {review}")
    print(f"Predicted sentiment: {predicted_sentiment}")
    print(_fraud_verdict(predicted_sentiment))

# Step 11: User input for review analysis
# `new_review` is kept at module level because the TF-IDF display step reuses it.
new_review = input("Enter a review to detect its sentiment: ")
predicted_sentiment = predict_sentiment(new_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")
print(_fraud_verdict(predicted_sentiment))

# Step 12: Function to display TF-IDF scores
import numpy as np

import numpy as np

def display_tfidf_scores(review):
    """Print the TF-IDF weight of each term of ``review`` under the fitted vectorizer.

    The review is cleaned with ``PreProcessText`` and transformed with the
    already-fitted ``vectorizer``. Every word/n-gram with a non-zero weight is
    printed, highest weight first. Words of the cleaned review that are not in
    the vectorizer's vocabulary are listed with a fixed placeholder weight of
    0.0001; that value is a display marker, not a computed TF-IDF score.

    Args:
        review: Raw review text.

    Returns:
        None. The scores are written to stdout.
    """
    # Preprocess the input review and keep its individual words for the OOV check
    review_processed = PreProcessText(review)
    words = review_processed.split()

    # Transform the review using the trained TF-IDF vectorizer (one sparse row)
    review_vectorized = vectorizer.transform([review_processed])
    feature_names = vectorizer.get_feature_names_out()

    # Read the non-zero entries straight from the sparse row instead of building
    # a dense vector as long as the whole vocabulary. Sorting by feature index
    # keeps ties in the same order a dense scan would produce.
    present = sorted(
        zip(review_vectorized.indices, review_vectorized.data),
        key=lambda pair: pair[0],
    )
    tfidf_dict = {feature_names[idx]: score for idx, score in present if score != 0}

    # Handle out-of-vocabulary (OOV) words: dict lookup in vocabulary_ is O(1),
    # unlike `in` on a NumPy array, which scans every feature name.
    vocabulary = vectorizer.vocabulary_
    for word in words:
        if word not in vocabulary:
            tfidf_dict[word] = 0.0001  # Placeholder weight for unseen words

    # Sort by TF-IDF score in descending order
    ranked = sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True)

    # Display TF-IDF scores
    print(f"\nTF-IDF scores for review: '{review}'")
    for word, score in ranked:
        print(f"{word}: {score}")

# Show the TF-IDF breakdown for the review the user typed at the input() prompt
# (`new_review` is assigned in Step 11).
display_tfidf_scores(new_review)




Label
negative    261
positive    239
Name: count, dtype: int64
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Accuracy: 0.5151515151515151
Confusion Matrix:
[[41 45]
 [35 44]]
Precision: 0.5178843431356737
Recall: 0.5151515151515151
F1 Score: 0.5146170701726257

Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.48      0.51        86
    positive       0.49      0.56      0.52        79

    accuracy                           0.52       165
   macro avg       0.52      0.52      0.51       165
weighted avg       0.52      0.52      0.51       165

Review: This app is amazing! I've never had any issues and it works perfectly.
Predicted sentiment: positive
App is Not Fraud
Review: Scam! They charged me without my consent and the app doesn't even work.
Predicted sentiment: negative
App is Fraud
Review: Great app for tracking my workouts. Highly recommend it to anyone!
Predicted sentiment: negative
App is Fraud
Review: Totally