In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the corpus; the 'text' column holds the documents and 'label' the targets.
data = pd.read_csv("train.csv")
X = data['text']
y = data['label']

# Split the raw documents BEFORE fitting any transformer. Fitting TF-IDF / PCA
# on the full corpus would let vocabulary statistics and principal directions
# from the test documents leak into the training features.
# With the same random_state and row count, the row assignment is the same as
# when the split was applied to the transformed matrix.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF: learn the vocabulary and IDF weights from the training documents only,
# then reuse them to encode the test documents.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf_train = tfidf.fit_transform(X_text_train).toarray()
X_tfidf_test = tfidf.transform(X_text_test).toarray()

# PCA: fit the projection on the training matrix only. A fixed seed keeps the
# result reproducible when scikit-learn selects a randomized solver.
pca = PCA(n_components=100, random_state=42)
X_train = pca.fit_transform(X_tfidf_train)
X_test = pca.transform(X_tfidf_test)

# Classical baselines, all trained on the same 100-dimensional features.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    # Seeded so repeated runs of the notebook give the same forest.
    'Random Forest': RandomForestClassifier(random_state=42)
}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Train ``model`` and report its classification metrics on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Held-out targets the predictions are compared against.

    Returns:
        dict with the keys 'Precision', 'Recall', 'Accuracy' and 'F1-Score'.
        Precision, recall and F1 are computed with ``average='weighted'``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Build the report key by key, in the order the results table expects.
    scores = {}
    scores['Precision'] = precision_score(y_test, predicted, average='weighted')
    scores['Recall'] = recall_score(y_test, predicted, average='weighted')
    scores['Accuracy'] = accuracy_score(y_test, predicted)
    scores['F1-Score'] = f1_score(y_test, predicted, average='weighted')
    return scores

# Fit and score every classical model on the shared train/test split.
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test)

# X_train / X_test hold dense float feature vectors (PCA projections), not
# integer token ids. An Embedding layer performs an integer index lookup, so it
# cannot meaningfully consume them. Instead, add a channel axis and let Conv1D
# slide directly over each feature vector.
n_features = X_train.shape[1]
cnn = Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Reshape((n_features, 1)),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
# NOTE(review): a single sigmoid unit with binary cross-entropy assumes the
# labels are binary 0/1 values — confirm against the 'label' column.
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), verbose=0)

# [loss, accuracy] on the held-out split, kept for inspection.
cnn_eval = cnn.evaluate(X_test, y_test, verbose=0)

# Turn the sigmoid probabilities into hard class predictions so that precision,
# recall and F1 are actually computed (previously all four entries were just
# the accuracy value). The 0.5 cut-off matches Keras' binary accuracy metric.
cnn_pred = (cnn.predict(X_test, verbose=0).ravel() > 0.5).astype(int)
results['CNN'] = {
    'Precision': precision_score(y_test, cnn_pred, average='weighted'),
    'Recall': recall_score(y_test, cnn_pred, average='weighted'),
    'Accuracy': accuracy_score(y_test, cnn_pred),
    'F1-Score': f1_score(y_test, cnn_pred, average='weighted')
}

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)






KeyboardInterrupt: 

In [None]:
from TurkishStemmer import TurkishStemmer

In [None]:
# Create one stemmer instance; the stemming() helper below reuses it for every word.
turk_stem = TurkishStemmer()
# Bare expression so the notebook displays the object's repr.
turk_stem

In [None]:
# str.lower() is locale-independent: it maps 'I' to 'i' (Turkish expects the
# dotless 'ı') and 'İ' to 'i' followed by a combining dot (U+0307). Map those
# two letters explicitly before lowercasing everything else.
_TURKISH_LOWER = str.maketrans({'I': 'ı', 'İ': 'i'})


def stemming(content):
    """Normalise a Turkish text: keep letters only, lowercase, drop stopwords, stem.

    Args:
        content: The raw text of one document. Non-string values (for example
            a missing value in a DataFrame column) are treated as empty text.

    Returns:
        str: The stemmed, stopword-free tokens joined by single spaces; an
        empty string when nothing remains.

    Relies on the notebook-level ``turk_stem`` stemmer and on ``stopwords``.
    NOTE(review): ``stopwords`` is not imported in the visible cells —
    presumably ``nltk.corpus.stopwords``; confirm and add the import.
    """
    if not isinstance(content, str):
        return ''

    # Replace every character that is not a Turkish/ASCII letter with a space.
    # (The original class contained a second '^', which is literal inside
    # brackets and therefore let caret characters survive the cleaning.)
    letters_only = re.sub('[^A-ZĞÜŞİÇÖa-zığüşöç]', ' ', content)
    lowered = letters_only.translate(_TURKISH_LOWER).lower()

    # Load the stopword list once per document instead of once per token, and
    # use a set so each membership test is O(1).
    turkish_stopwords = set(stopwords.words('turkish'))
    stems = [
        turk_stem.stem(word)
        for word in lowered.split()
        if word not in turkish_stopwords
    ]
    return ' '.join(stems)

In [None]:
# Overwrite the 'text' column with the result of stemming() applied to each entry.
data['text'] = data['text'].apply(stemming)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load the data.
# NOTE(review): this re-reads train.csv, so in-memory changes made to
# data['text'] by earlier cells (e.g. the stemming cell) are not used here —
# confirm that this is intended.
data = pd.read_csv("train.csv")

# Drop rows with a missing text or label, keep the first 40,000 remaining rows,
# then discard rows whose text is blank after stripping whitespace.
data = data.dropna(subset=['text', 'label']).iloc[:40000]
data = data[data['text'].str.strip() != '']

# Separate the documents from their labels.
text_data = data['text']
labels = data['label']

# Encode the labels as integers 0..n_classes-1.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Split the raw documents BEFORE fitting TF-IDF / SVD so that no statistics
# from the test documents leak into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    text_data, y_encoded, test_size=0.3, random_state=42
)

# TF-IDF features (10,000 most frequent terms), learned from the training
# documents only and then applied to the test documents.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf_train = tfidf_vectorizer.fit_transform(text_train)
X_tfidf_test = tfidf_vectorizer.transform(text_test)

# Reduce to 100 dimensions with truncated SVD, again fitted on training data only.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train = svd.fit_transform(X_tfidf_train)
X_test = svd.transform(X_tfidf_test)

# Define and train the logistic regression model.
logistic_model = LogisticRegression(max_iter=500, random_state=42)
logistic_model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = logistic_model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# Class names as strings, in the encoder's index order.
target_names = [str(cls) for cls in label_encoder.classes_]

# Print the classification report. Passing `labels` explicitly keeps the report
# aligned with `target_names` even if some class does not occur in the test
# split (otherwise the length mismatch raises a ValueError). zero_division=0
# reports 0 for metrics that are undefined for such a class instead of warning.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))