In [5]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Wczytaj dane
with open('positive.txt', 'r', encoding='utf-8') as file:
 positive_data = file.readlines()
with open('negative.txt', 'r', encoding='utf-8') as file:
 negative_data = file.readlines()

# Przydziel klasy (0 - positive, 1 - negative)
positive_df = pd.DataFrame({'text': positive_data, 'class': 0})
negative_df = pd.DataFrame({'text': negative_data, 'class': 1})

# Połącz ramki danych
df = pd.concat([positive_df, negative_df], ignore_index=True)

# Przemieszaj zbiór danych
df = shuffle(df)
df.head()

Unnamed: 0,text,class
142,"Męczące monologi postaci. Zbyt wiele gadania, ...",1
182,"Film, który traci swój klimat w drugiej połowi...",1
183,Banalna i przewidywalna fabuła. Nie ma momentó...,1
168,Wstrząsające sceny przemocy. Nie każdy jest go...,1
113,Brak spójności w narracji. Film nie trzyma...,1


Dzielenie danych na zbiór uczący i testowy (20%)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.2, random_state=42)


Pipeline klasyfikatorów

In [27]:
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC())
]

vectorizerTokenPatterns = [
    ('Every word', CountVectorizer()),
    ('More than 3 characters', CountVectorizer(token_pattern=r'\b\w{3,}\b')), # any word that: 3 <= characters
    ('More than 2 characters', CountVectorizer(token_pattern=r'\b\w{2,}\b')) # any word that: 2 <= characters
]

Pętla do testowania wszystkich zdefiniowanych klasyfikatorów pod względem ich dokładności

In [28]:
results = []
for classifier_name, classifier in classifiers:
    for pattern_desc, pattern in vectorizerTokenPatterns:
        pipeline = Pipeline([
            ('vectorizer', pattern),
            ('classifier', classifier)
        ])
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

        # Trenuj model
        pipeline.fit(X_train, y_train)

        # Przewiduj na danych testowych
        y_pred = pipeline.predict(X_test)

        # Oceniaj wyniki
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        # Dodaj wyniki do listy
        results.append({
            'Classifier': classifier_name,
            'Pattern': pattern_desc,
            'Mean Accuracy': cv_scores.mean(),
            'Cross-Validation Scores': cv_scores,
            'Classification Report': report
        })


Wyniki

In [30]:
for result in results:
    print(f"Classifier: {result['Classifier']}")
    print(f"Pattern description: {result['Pattern']}")
    print(f"Cross-Validation Scores: {result['Cross-Validation Scores']}")
    print(f"Mean CV Accuracy: {result['Mean Accuracy']:.4f}")
    print("Classification Report:")
    print(result['Classification Report'])
    print("=" * 50)

Classifier: Decision Tree
Pattern description: Every word
Cross-Validation Scores: [0.76666667 0.86666667 0.73333333 0.75862069 0.72413793]
Mean CV Accuracy: 0.7699
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.94      0.79        16
           1       0.93      0.67      0.78        21

    accuracy                           0.78        37
   macro avg       0.81      0.80      0.78        37
weighted avg       0.82      0.78      0.78        37

Classifier: Decision Tree
Pattern description: More than 3 characters
Cross-Validation Scores: [0.83333333 0.83333333 0.8        0.75862069 0.75862069]
Mean CV Accuracy: 0.7968
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.94      0.79        16
           1       0.93      0.67      0.78        21

    accuracy                           0.78        37
   macro avg       0.81      0.80      0.78        37
weighted avg 

TypeError: list indices must be integers or slices, not str