# Import Libraries

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
import re


/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'<br />', '', text)  
    text = re.sub(r'http\S+', '', text) 
    text = re.sub(r'[^\w\s]', '', text)  
    text = ' '.join([word for word in text.split() if word not in stop_words]) 
    return text

df['clean_review'] = df['review'].apply(clean_text)

df[['review', 'clean_review']].head()

df.drop(columns = 'review', inplace = True)


In [5]:
# Etiketleri sayısal hale getirelim
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


# Feature Extraction

In [6]:
# TF-IDF kullanarak özellik çıkarımı yapalım
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_review']).toarray()

# Hedef değişken (etiketler)
y = df['sentiment']


# Train-Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Models

## Logistic Regression

In [8]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

y_pred_logistic_test = logistic_model.predict(X_test)
y_pred_logistic_train = logistic_model.predict(X_train)

print(classification_report(y_test, y_pred_logistic_test))
print("**************************************************")
print(classification_report(y_train, y_pred_logistic_train))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

**************************************************
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     20039
           1       0.91      0.92      0.91     19961

    accuracy                           0.91     40000
   macro avg       0.91      0.91      0.91     40000
weighted avg       0.91      0.91      0.91     40000



## SVM

In [9]:
"""svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred_svm_test = svm_model.predict(X_test)
y_pred_svm_train = svm_model.predict(X_train)

print(classification_report(y_test, y_pred_svm_test))
print("**************************************************")
print(classification_report(y_train, y_pred_svm_train))
"""

'svm_model = SVC(kernel=\'linear\')\nsvm_model.fit(X_train, y_train)\n\ny_pred_svm_test = svm_model.predict(X_test)\ny_pred_svm_train = svm_model.predict(X_train)\n\nprint(classification_report(y_test, y_pred_svm_test))\nprint("**************************************************")\nprint(classification_report(y_train, y_pred_svm_train))\n'

## Random Forest

In [10]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf_test = rf_model.predict(X_test)
y_pred_rf_train = rf_model.predict(X_train)


print(classification_report(y_test, y_pred_rf_test))
print("**************************************************")
print(classification_report(y_train, y_pred_rf_train))


              precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

**************************************************
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20039
           1       1.00      1.00      1.00     19961

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



## XGBoost

In [12]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb_test = xgb_model.predict(X_test)
y_pred_xgb_train = xgb_model.predict(X_train)

print(classification_report(y_test, y_pred_xgb_test))
print("**************************************************")
print(classification_report(y_train, y_pred_xgb_train))


              precision    recall  f1-score   support

           0       0.87      0.83      0.85      4961
           1       0.84      0.88      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

**************************************************
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     20039
           1       0.91      0.94      0.92     19961

    accuracy                           0.92     40000
   macro avg       0.92      0.92      0.92     40000
weighted avg       0.92      0.92      0.92     40000



## Naive Bayes

In [13]:
# Naive Bayes modeli
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Test verisi üzerinde tahmin yap
y_pred_nb_test = nb_model.predict(X_test)
y_pred_nb_train = nb_model.predict(X_train)


print(classification_report(y_test, y_pred_nb_test))
print("**************************************************")
print(classification_report(y_train, y_pred_nb_train))


              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

**************************************************
              precision    recall  f1-score   support

           0       0.87      0.86      0.86     20039
           1       0.86      0.87      0.86     19961

    accuracy                           0.86     40000
   macro avg       0.86      0.86      0.86     40000
weighted avg       0.86      0.86      0.86     40000

