In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# --------------------------
# 1. Load CSVs
# --------------------------
df1 = pd.read_csv("channel_master_cleaned.csv")
df2 = pd.read_csv("video_summary(clean data)byDE(RR).csv")
df = pd.concat([df1, df2], ignore_index=True)

# --------------------------
# 2. Clean NaNs and create text field
# --------------------------
df['title'] = df['title'].fillna('')
df['shortdescription'] = df['shortdescription'].fillna('')
df['text'] = df['title'].astype(str) + " " + df['shortdescription'].astype(str)

# --------------------------
# 3. Safety labeling rules
# --------------------------
unsafe_keywords = [
    "violence", "kill", "murder", "fight", "sex", "porn", "gambling",
    "drugs", "weapon", "alcohol", "blood", "suicide", "nsfw"
]

def assign_safety(text):
    t = text.lower()
    for word in unsafe_keywords:
        if word in t:
            return "not_safe"
    return "kids_safe"

df['safety'] = df['text'].apply(assign_safety)

# --------------------------
# 4. Train/Test Split
# --------------------------
X = df['text']
y = df['safety']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------
# 5. Build ML Pipeline
# --------------------------
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=200))
])

# Train model
model.fit(X_train, y_train)

# --------------------------
# 6. Evaluate
# --------------------------
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# --------------------------
# 7. Predict for new video
# --------------------------
def predict_safety(title, description):
    text = title + " " + description
    pred = model.predict([text])[0]
    prob = model.predict_proba([text]).max()
    return pred, prob

# Example test
print(predict_safety("Fun science experiment for kids", "Learn how to make a volcano with baking soda"))
print(predict_safety("Violent fight caught on camera", "Shocking street fight with blood"))


Accuracy: 0.9895038167938931
              precision    recall  f1-score   support

   kids_safe       0.99      1.00      0.99      2074
    not_safe       0.00      0.00      0.00        22

    accuracy                           0.99      2096
   macro avg       0.49      0.50      0.50      2096
weighted avg       0.98      0.99      0.98      2096

('kids_safe', np.float64(0.9902302387706253))
('kids_safe', np.float64(0.9831990807429795))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
