In [2]:
import os

In [3]:
# Point the Kaggle CLI at the directory holding the API credentials.
# The variable name is KAGGLE_CONFIG_DIR (underscores only); a key containing a
# space is never read by the CLI, so the setting would silently have no effect.
os.environ["KAGGLE_CONFIG_DIR"] = '/content'

In [5]:
# Fetch the dataset archive with the Kaggle CLI. Per the recorded output, the CLI
# skips the download when a more recent local copy exists (--force re-downloads).
!kaggle datasets download -d mustfkeskin/turkish-movie-sentiment-analysis-dataset

Dataset URL: https://www.kaggle.com/datasets/mustfkeskin/turkish-movie-sentiment-analysis-dataset
License(s): CC0-1.0
turkish-movie-sentiment-analysis-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
!unzip /content/turkish-movie-sentiment-analysis-dataset.zip

Archive:  /content/turkish-movie-sentiment-analysis-dataset.zip
replace turkish_movie_sentiment_dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:

# Load the extracted review CSV into a DataFrame (adjust the path as needed).
# NOTE(review): the recorded previews show `point` values such as "5,0", i.e. a
# comma decimal separator, so that column is presumably loaded as text — confirm
# before using it numerically.
df = pd.read_csv(filepath_or_buffer='/content/turkish_movie_sentiment_dataset.csv')


In [8]:
# Explore the dataset: preview the first rows.
# Ending the cell with the bare expression lets Jupyter render the DataFrame as a
# rich table instead of the plain-text dump that print() produces.
df.head()


                                             comment      film_name point
0  \n                      Jean Reno denince zate...  Sevginin Gücü   5,0
1  \n                      Ekşın falan izlemek is...  Sevginin Gücü   5,0
2  \n                      Bu yapım hakkında öyle...  Sevginin Gücü   5,0
3  \n                      finali yeter... (sting...  Sevginin Gücü   5,0
4  \n                      Jean Reno..\r\nbu adam...  Sevginin Gücü   5,0


In [9]:
# Preview the last rows; a bare last expression gets Jupyter's rich table display,
# which print() would flatten to wrapped plain text.
df.tail()

                                                 comment  \
83222  \n                      Böyle bi kadrodan, bçy...   
83223  \n                      yani bu kaar ii oyunca...   
83224  \n                      bugün dvd'sini alıp iz...   
83225  \n                      Klasik korku ve gerili...   
83226  \n                      Bence gereğinden fazla...   

                          film_name point  
83222                         Kabus   2,5  
83223                         Kabus   2,5  
83224                         Kabus   1,0  
83225                         Kabus   4,0  
83226  Bir Zamanlar... Hollywood'da   4,6  


In [10]:
# List the column labels of the loaded frame.
column_labels = df.columns
print(column_labels)

Index(['comment', 'film_name', 'point'], dtype='object')


In [11]:
# Frequency of each distinct raw comment, most frequent first (the defaults, spelled out).
# NOTE(review): the recorded output is topped by whitespace-only comments —
# presumably blank reviews; confirm whether they should be dropped before training.
print(df['comment'].value_counts(sort=True, ascending=False))

comment
\n        \n                                                                                                                                                                                                                                                                                                                                                                                                                           300
\n                                                                                                                                                                                                                                                                                                                                                                                                                                     163
\n                                                                                                                                        

In [16]:
# Number of reviews per film, most reviewed first (the defaults, spelled out).
print(df['film_name'].value_counts(sort=True, ascending=False))

film_name
Kapan                            92
Köstebek                         76
Cinnet                           64
Deney                            61
Şüphe                            54
                                 ..
Drakula: Ölü ve Mutlu             1
Karlar Kraliçesi 2                1
Kursk                             1
Gassal                            1
Max Maceraları: Kralın Doğuşu     1
Name: count, Length: 7722, dtype: int64


In [12]:
# Preprocessing: NLTK's Turkish stop-word list, held in a set for fast membership tests.
stop_words = {*stopwords.words('turkish')}
def preprocess_text(text, stopword_set=None):
    """Lower-case a comment with Turkish casing rules and drop stop words.

    Args:
        text: Raw comment. Anything that is not a ``str`` (for example the
            ``NaN``/``None`` of a missing cell) is treated as an empty comment
            instead of raising ``AttributeError``.
        stopword_set: Optional collection of lower-case words to remove.
            When omitted, the module-level ``stop_words`` is used.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces, or
        ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # str.lower() is locale-independent: it turns 'I' into 'i' and U+0130
    # (dotted capital I) into 'i' followed by a combining dot (U+0307). Turkish
    # needs 'I' -> dotless U+0131 and U+0130 -> plain 'i'; without this mapping,
    # upper-case words such as "NASIL" never match the lower-case stop-word list.
    text = text.replace('\u0130', 'i').replace('I', '\u0131').lower()

    # split() with no argument also discards the leading newlines and padding
    # that surround the raw comments.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run every raw comment through preprocess_text and keep the result as a new column.
processed_comments = df['comment'].apply(preprocess_text)
df['processed_text'] = processed_comments

In [13]:
# Split the dataset: hold out 20% of the rows for testing, with a fixed seed.
# NOTE(review): the label used here is `film_name` (the recorded value_counts
# reports 7722 distinct titles), not the `point` rating — confirm this is the
# intended target.
X, y = df['processed_text'], df['film_name']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Sanity-check the training split: container types first, then the leading rows.
print(type(X_train), type(y_train), sep='\n')
print(X_train.head(), y_train.head(), sep='\n')


<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
13037    kankamın tavsiyesi üzerine izledim biraz önyar...
81128    arkdaşım dikkat edersen yukarıda gösterim tari...
61294              steve buscemi adamım harikasın yine....
51619    bence etkıleyıcı cok fazla beklentıyle ızlemye...
37732    filme yorum yazmayı unutmuşum oysaki yorum yaz...
Name: processed_text, dtype: object
13037                              Ruhlar Bölgesi
81128    Jonas Brothers: Üç Boyutlu Konser Deneyi
61294                                     Görüşme
51619                                  Fred Claus
37732                                  Babam İçin
Name: film_name, dtype: object


In [15]:
# Count missing values in the training features and in the training labels.
print(X_train.isnull().sum(), y_train.isnull().sum(), sep='\n')


0
0


In [None]:
# Build and train the pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

try:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    # zero_division=0 reports 0 for labels that never appear among the
    # predictions instead of emitting an undefined-metric warning per label.
    print(classification_report(y_test, y_pred, zero_division=0))
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the full traceback is shown and "Run All" stops here rather
    # than continuing with an unfitted pipeline or a missing y_pred.
    raise