In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump

# Load the dataset
data_path = r'C:\Users\HP\Downloads\mal_training_data_hum_ai.csv'  # Update path
data = pd.read_csv(data_path)

# Display the first few rows to understand the structure
print(data.head())

# Check for missing values and handle them if necessary
print("Missing values:")
print(data.isnull().sum())
data.dropna(inplace=True)

# Separate features and labels
if 'DATA' not in data.columns or 'LABEL' not in data.columns:
    raise ValueError("Dataset must have 'DATA' and 'LABEL' columns.")
X = data['DATA']  # Text reviews
y = data['LABEL']  # Labels ('AI' or 'HUMAN')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char_wb')  # Add char_wb for better handling
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the vectorizer and model for future use
vectorizer_path = r'C:\Users\HP\vectorizer_malayalam.joblib'
model_path = r'C:\Users\HP\model_malayalam.joblib'
dump(vectorizer, vectorizer_path)
dump(model, model_path)
print(f"Vectorizer saved to {vectorizer_path}")
print(f"Model saved to {model_path}")

# Predictions for the entire dataset
full_tfidf = vectorizer.transform(X)
all_predictions = model.predict(full_tfidf)

# Add predictions to the original dataset
data['prediction'] = all_predictions

# Save the predictions to a CSV file
output_path = r'C:\Users\HP\Mal_MNB(training).csv'  # Update path
data.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


                ID                                               DATA  LABEL
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...  HUMAN
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം  HUMAN
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...  HUMAN
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...  HUMAN
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ  HUMAN
Missing values:
ID       0
DATA     0
LABEL    0
dtype: int64
Accuracy: 0.8625
Classification Report:
              precision    recall  f1-score   support

          AI       0.85      0.89      0.87        80
       HUMAN       0.88      0.84      0.86        80

    accuracy                           0.86       160
   macro avg       0.86      0.86      0.86       160
weighted avg       0.86      0.86      0.86       160

Vectorizer saved to C:\Users\HP\vectorizer_malayalam.joblib
Model saved to C:\Users\HP\model_malayalam.joblib

In [3]:
# Load the pre-trained Malayalam vectorizer and model
vectorizer_malayalam = load(r'C:\Users\HP\vectorizer_malayalam.joblib')  # Path to Malayalam vectorizer
model_malayalam = load(r'C:\Users\HP\model_malayalam.joblib')  # Path to Malayalam model

# Load the Malayalam test dataset
test_data_path = r'C:\Users\HP\Downloads\mal_test_data_hum_ai.xlsx'
test_data = pd.read_excel(test_data_path)

# Apply tokenization (same as before)
test_data['DATA'] = test_data['DATA'].apply(tokenize_malayalam)

# Transform using Malayalam vectorizer
X_test_data_tfidf = vectorizer_malayalam.transform(test_data['DATA'])

# Make predictions
test_predictions = model_malayalam.predict(X_test_data_tfidf)

# Add predictions to the test dataset
test_data['prediction'] = test_predictions

# Save the predictions
output_test_path = r'C:\Users\HP\Downloads\Malayalam_MNB_Predictions.csv'
test_data.to_csv(output_test_path, index=False)

print(f"Malayalam predictions saved to {output_test_path}")


Malayalam predictions saved to C:\Users\HP\Downloads\Malayalam_MNB_Predictions.csv
