In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Preview both frames. They are printed as two separate statements: joining the
# calls with a comma builds the tuple (None, None) from print's return values,
# which the notebook then echoes as the cell's result.
print(train)
print(test)

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  
7610  M1.94 [01:04 UT

(None, None)

In [4]:
def preprocess_text(text):
    """Normalize a raw text string for bag-of-words vectorization.

    Steps, in order: remove URL-like tokens (anything starting with ``http``
    or ``www`` up to the next whitespace), lowercase, replace every non-word
    character with a space, then collapse whitespace runs and trim the ends.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (for example ``None`` or the float NaN
        that pandas uses for empty CSV cells) are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None, so a single empty cell would
        # abort a whole Series.apply; map missing values to empty text instead.
        return ""
    # "http\S+" already matches https URLs, so no separate alternative is needed,
    # and no flag is required because the pattern has no ^/$ anchors.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.lower()
    # Punctuation and symbols become spaces; letters, digits and "_" are kept.
    text = re.sub(r"\W", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Clean the text and keyword columns of each frame, then join them into the
# single "combined_text" feature. Missing keywords become empty strings first.
for frame in (train, test):
    frame["text"] = frame["text"].apply(preprocess_text)
    frame["keyword"] = frame["keyword"].fillna("").apply(preprocess_text)
    frame["combined_text"] = frame["keyword"] + " " + frame["text"]

In [5]:
# Features are the combined keyword+text strings; labels are the target column.
X, y = train["combined_text"], train["target"]

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit the TF-IDF vectorizer on the training split only; the validation split is
# transformed with that fitted vectorizer rather than being refit.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Logistic regression with library-default settings, trained on the TF-IDF features.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [7]:
# Score the held-out validation split with the fitted model.
y_val_pred = model.predict(X_val_tfidf)
accuracy, f1 = (
    accuracy_score(y_val, y_val_pred),
    f1_score(y_val, y_val_pred),
)
print("Accuracy:", accuracy)
print("F1 score:", f1)

Accuracy: 0.8010505581089954
F1 score: 0.7518427518427518


In [8]:
# Make predictions on the test set: vectorize its combined text with the same
# vectorizer object (transform only, no refit) and predict with the model.
X_test = test["combined_text"]
X_test_tfidf = vectorizer.transform(X_test)
test_preds = model.predict(X_test_tfidf)

In [9]:
# Show the predicted labels for the test set. The printed label is Chinese for
# "Prediction results for the test data:".
print("测试数据的预测结果:", test_preds)

测试数据的预测结果: [1 0 1 ... 1 1 0]
