In [1]:
import pandas as pd

# Read the training and test tables from CSV files in the working directory
# (train.csv first, then test.csv).
train_df, test_df = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Report the (rows, columns) of each table.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Plain-text preview of the first five training rows.
print(train_df.head(n=5))


Train shape: (2029, 9)
Test shape: (10, 8)
   row_id                                               body  \
0       0  Banks don't want you to know this! Click here ...   
1       1  SD Stream [ ENG Link 1] (http://www.sportsstre...   
2       2  Lol. Try appealing the ban and say you won't d...   
3       3  she will come your home open her legs with  an...   
4       4  code free tyrande --->>> [Imgur](http://i.imgu...   

                                                rule      subreddit  \
0  No Advertising: Spam, referral links, unsolici...     Futurology   
1  No Advertising: Spam, referral links, unsolici...  soccerstreams   
2  No legal advice: Do not offer or request legal...   pcmasterrace   
3  No Advertising: Spam, referral links, unsolici...            sex   
4  No Advertising: Spam, referral links, unsolici...    hearthstone   

                                  positive_example_1  \
0  If you could tell your younger self something ...   
1  [I wanna kiss you all over! St

In [2]:
# Training split: the three input columns plus the "rule_violation" target.
X_train, y_train = (
    train_df.loc[:, ["body", "subreddit", "rule"]],
    train_df.loc[:, "rule_violation"],
)

# Test split: the same three input columns; no target is read from it.
X_test = test_df.loc[:, ["body", "subreddit", "rule"]]


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the 'body' text into TF-IDF features. The vocabulary is learned from
# the training bodies only (English stop words removed, at most 10,000 terms)
# and then reused to encode the test bodies, so both matrices share columns.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")

X_train_tfidf = vectorizer.fit_transform(X_train.loc[:, "body"])
X_test_tfidf = vectorizer.transform(X_test.loc[:, "body"])

# Show the resulting matrix dimensions for both splits.
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)


Train TF-IDF shape: (2029, 8350)
Test TF-IDF shape: (10, 8350)


In [4]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the TF-IDF training matrix,
# allowing the solver up to 500 iterations.
clf = LogisticRegression(max_iter=500).fit(X_train_tfidf, y_train)

# Predicted class label for every row of the test matrix.
y_pred = clf.predict(X_test_tfidf)

# Optional evaluation, usable only when true test labels (y_test) exist:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))


In [5]:
# Add the model's predictions to the test frame as a new column
# (this modifies test_df itself).
test_df["predicted_rule_violation"] = y_pred

# Write the augmented test frame to disk, omitting the pandas row index.
test_df.to_csv(path_or_buf="test_with_predictions.csv", index=False)
