In [None]:
import pandas as pd

# Load the training split and the validation split (the latter is used as the test set)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/easy_train.csv', 'data/easy_validation.csv')
)

# Quick look at the first rows of the training data
print(train_data.head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.spatial.distance import cosine
import numpy as np

# Custom transformer that turns a pair of text columns into one similarity feature
class ArticlePairTransformer(BaseEstimator, TransformerMixin):
    """Compute the TF-IDF cosine similarity between two text columns.

    The input is expected to expose the columns ``'sentence_1'`` and
    ``'sentence_2'`` (e.g. a pandas DataFrame). A single ``TfidfVectorizer``
    is fitted on the texts of both columns, and each row is then scored with
    the cosine similarity of its two TF-IDF vectors.

    Parameters
    ----------
    ngram_range : tuple of (int, int), default=(1, 2)
        Passed unchanged to ``TfidfVectorizer``.

    Attributes
    ----------
    vectorizer_ : TfidfVectorizer
        The vectorizer fitted by ``fit``.
    """

    def __init__(self, ngram_range=(1, 2)):
        # Only store the hyper-parameter: scikit-learn's get_params()/clone()
        # read it back from an attribute with the same name as the argument.
        self.ngram_range = ngram_range

    def fit(self, X, y=None):
        """Fit the TF-IDF vectorizer on the texts of both columns.

        Parameters
        ----------
        X : object indexable by ``'sentence_1'`` and ``'sentence_2'``
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Both columns share one vocabulary so their vectors are comparable
        articles = list(X['sentence_1']) + list(X['sentence_2'])
        self.vectorizer_ = TfidfVectorizer(ngram_range=self.ngram_range).fit(articles)
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``'cosine_similarity'`` with one row per pair.

        Pairs in which either text has an all-zero TF-IDF vector get a
        similarity of 0.0 instead of NaN.
        """
        first = self.vectorizer_.transform(X['sentence_1'])
        second = self.vectorizer_.transform(X['sentence_2'])

        # Row-wise dot product and norms computed directly on the sparse
        # matrices; this avoids densifying one row at a time.
        dot = self._row_sums(first.multiply(second))
        norms = np.sqrt(
            self._row_sums(first.multiply(first)) * self._row_sums(second.multiply(second))
        )

        # scipy.spatial.distance.cosine would give the *distance* (1 - similarity)
        # here, so the similarity is computed explicitly; zero-norm rows keep the
        # 0.0 from `out` rather than dividing by zero.
        similarities = np.divide(dot, norms, out=np.zeros_like(dot, dtype=float), where=norms > 0)

        return pd.DataFrame({'cosine_similarity': similarities})

    @staticmethod
    def _row_sums(matrix):
        """Sum each row of a (sparse) matrix and return a flat float array."""
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

# Build the feature pipeline (bigrams only)
pipeline = Pipeline([
    ('features', ArticlePairTransformer(ngram_range=(2, 2))),
])

# Fit the vectorizer on the training split only, then reuse that fitted
# vocabulary/IDF for the held-out split. Calling fit_transform on the test
# data would refit on it, so train and test features would come from
# different vectorizers.
X_train_transformed = pipeline.fit_transform(train_data)
X_test_transformed = pipeline.transform(test_data)

In [None]:
# Replace every NaN in the train and test feature frames with 0 (in place)
for feature_frame in (X_train_transformed, X_test_transformed):
    feature_frame.fillna(0, inplace=True)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Features and labels for the train and test splits
X_train, X_test = X_train_transformed, X_test_transformed
y_train, y_test = train_data['label'], test_data['label']

# Train a logistic-regression classifier with default settings
model = LogisticRegression().fit(X_train, y_train)

# Predict on the test split and score the predictions
y_pred = model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print('Classification Report:', classification_rep, sep='\n')


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Split into features and labels
y_train = train_data['label']
y_test = test_data['label']
X_train = X_train_transformed
X_test = X_test_transformed

# Train a support-vector classifier with default settings
model = SVC()
model.fit(X_train, y_train)

# Predict on the test split, then compute the evaluation metrics
y_pred = model.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

for summary_line in (f'Accuracy: {accuracy}', f'F1 Score: {f1}', 'Classification Report:'):
    print(summary_line)
print(classification_rep)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Features (X) and labels (y) for both splits
X_train, y_train = X_train_transformed, train_data['label']
X_test, y_test = X_test_transformed, test_data['label']

# Train a k-nearest-neighbours classifier that votes over 1000 neighbours
model = KNeighborsClassifier(n_neighbors=1000)
model = model.fit(X_train, y_train)

# Predict on the test split and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print('Classification Report:', classification_rep, sep='\n')
