In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, MaxPooling1D, Flatten
from transformers import BertTokenizer, TFBertForSequenceClassification

data = pd.read_csv("cleanData.csv") 
X = data['cleanText']
y = data['label']







In [3]:
from TurkishStemmer import TurkishStemmer
import re
import nltk
from nltk.corpus import stopwords

In [4]:
turk_stem = TurkishStemmer()
turk_stem

<TurkishStemmer.TurkishStemmer at 0x282a05b6a90>

In [5]:
def stemming(content):
    review = re.sub('[^A-ZĞÜŞİÇÖ^a-zığüşöç]',' ',content)
    review = review.lower()
    review = review.split()
    review = [turk_stem.stem(word) for word in review if not word in stopwords.words('turkish')]
    review = ' '.join(review)
    return review

In [6]:
data['cleanText'] = data['cleanText'].apply(stemming)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

data = data.dropna(subset=['cleanText', 'label']).iloc[:40000]
data = data[data['cleanText'].str.strip() != '']

text_data = data['cleanText']
labels = data['label']

tfidf_vectorizer = TfidfVectorizer(max_features=30000)
X_tfidf = tfidf_vectorizer.fit_transform(text_data)

svd = TruncatedSVD(n_components= 400, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_encoded, test_size=0.3, random_state=42)

logistic_model = LogisticRegression(max_iter=500, random_state=42)
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

target_names = [str(cls) for cls in label_encoder.classes_]

print(classification_report(y_test, y_pred, target_names=target_names))

Test Accuracy: 0.8681666666666666
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      6719
           1       0.95      0.74      0.83      5281

    accuracy                           0.87     12000
   macro avg       0.89      0.85      0.86     12000
weighted avg       0.88      0.87      0.87     12000



In [8]:
data.head()

Unnamed: 0,cleanText,label
0,gerçek siz hikaye izleyerek mi yen yıl girice,0
1,user çoook bi baklav bi sen zaten,0
2,sn dükel atatürk karm e m başlattık siyasi bağ...,0
3,konfederasyon ail sosyal politika bakanlık ist...,0
4,hakem tarih yazıyor kişi karş ön olacak alanya...,1


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import pandas as pd

tfidf_vectorizer = TfidfVectorizer(max_features=300000)
X_tfidf = tfidf_vectorizer.fit_transform(text_data)

svd = TruncatedSVD(n_components= 32, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
y_categorical = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_categorical, test_size=0.3, random_state=42)

model = Sequential([
    Dense(256, input_dim= 32, activation='relu'),
    Dense(128, activation='relu'),
    Dense(y_categorical.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 989us/step - accuracy: 0.7948 - loss: 0.4486 - val_accuracy: 0.8384 - val_loss: 0.3720
Epoch 2/50
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 859us/step - accuracy: 0.8486 - loss: 0.3593 - val_accuracy: 0.8468 - val_loss: 0.3575
Epoch 3/50
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 929us/step - accuracy: 0.8480 - loss: 0.3588 - val_accuracy: 0.8463 - val_loss: 0.3571
Epoch 4/50
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - accuracy: 0.8485 - loss: 0.3531 - val_accuracy: 0.8501 - val_loss: 0.3535
Epoch 5/50
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 934us/step - accuracy: 0.8559 - loss: 0.3444 - val_accuracy: 0.8497 - val_loss: 0.3525
Epoch 6/50
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 899us/step - accuracy: 0.8554 - loss: 0.3439 - val_accuracy: 0.8503 - val_loss: 0.3504
Epoch 7/50
[1m875/875[0m 

In [16]:
from sklearn.svm import SVC

tfidf_vectorizer = TfidfVectorizer(max_features=30000)
X_tfidf = tfidf_vectorizer.fit_transform(text_data)

svd = TruncatedSVD(n_components=400, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_encoded, test_size=0.3, random_state=42)

svm_model = SVC(kernel='linear', random_state=42)

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred, target_names=[str(label) for label in label_encoder.classes_]))


Test Accuracy: 0.8665833333333334
              precision    recall  f1-score   support

           0       0.82      0.98      0.89      6719
           1       0.97      0.72      0.83      5281

    accuracy                           0.87     12000
   macro avg       0.89      0.85      0.86     12000
weighted avg       0.88      0.87      0.86     12000



In [20]:
from sklearn.neighbors import KNeighborsClassifier

tfidf_vectorizer = TfidfVectorizer(max_features=30000)
X_tfidf = tfidf_vectorizer.fit_transform(text_data)

svd = TruncatedSVD(n_components=400, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_encoded, test_size=0.3, random_state=42)

knn_model = KNeighborsClassifier(n_neighbors=5) 

knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred, target_names=[str(label) for label in label_encoder.classes_]))


Test Accuracy: 0.7600833333333333
              precision    recall  f1-score   support

           0       0.72      0.94      0.81      6719
           1       0.87      0.53      0.66      5281

    accuracy                           0.76     12000
   macro avg       0.80      0.74      0.74     12000
weighted avg       0.79      0.76      0.75     12000



In [23]:
from sklearn.tree import DecisionTreeClassifier

tfidf_vectorizer = TfidfVectorizer(max_features=30000)
X_tfidf = tfidf_vectorizer.fit_transform(text_data)

svd = TruncatedSVD(n_components=400, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_encoded, test_size=0.3, random_state=42)

decision_tree_model = DecisionTreeClassifier(random_state=42, max_depth=100)  # max_depth ile ağacın derinliği sınırlandı

decision_tree_model.fit(X_train, y_train)

y_pred = decision_tree_model.predict(X_test)

print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred, target_names=[str(label) for label in label_encoder.classes_]))


Test Accuracy: 0.7985833333333333
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      6719
           1       0.70      0.71      0.70      5281

    accuracy                           0.74     12000
   macro avg       0.73      0.73      0.73     12000
weighted avg       0.74      0.74      0.74     12000

