In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Importing Support Vector Classifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training dataset (this dataset will be used for training the model)
train_data_path = r'C:\Users\MITHUN CHAKRAVARTHY\Downloads\mal_training_data_hum_ai.csv'  # Path to training data
train_data = pd.read_csv(train_data_path)

# Display the first few rows to understand the structure of the training dataset
print("Training Data Preview:")
print(train_data.head())

# Check for missing values and handle them if necessary
print("Missing values in training data:")
print(train_data.isnull().sum())
train_data.dropna(inplace=True)

# Separate features and labels for training dataset
X_train = train_data['DATA']  # Assuming the text reviews are in a column named 'DATA'
y_train = train_data['LABEL']   # Assuming labels ('AI' or 'HUMAN') are in a column named 'LABEL'

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Text vectorization using TF-IDF for the training data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Train a classifier (Support Vector Machine)
model = SVC(kernel='linear', random_state=42)  # Using linear kernel for SVM
model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set to evaluate the model
y_val_pred = model.predict(X_val_tfidf)

# Evaluate the model on the validation data
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

# Now load the prediction dataset (Excel file)
predict_data_path = r'C:\Users\MITHUN CHAKRAVARTHY\Downloads\mal_test_data_hum_ai.xlsx'  # Path to prediction data (Excel file)
predict_data = pd.read_excel(predict_data_path)  # Use pd.read_excel for Excel files

# Display the first few rows to understand the structure of the prediction dataset
print("Prediction Data Preview:")
print(predict_data.head())

# Check for missing values and handle them if necessary
print("Missing values in prediction data:")
print(predict_data.isnull().sum())
predict_data.dropna(inplace=True)

# Separate features for prediction dataset (assuming it's in the same format as the training data)
X_predict = predict_data['DATA']  # Assuming the column with text data is 'DATA'

# Text vectorization using the same TF-IDF vectorizer
X_predict_tfidf = vectorizer.transform(X_predict)

# Make predictions on the new dataset
predictions = model.predict(X_predict_tfidf)

# Add predictions to the original prediction dataset
predict_data['prediction'] = predictions

# Save the predictions to a new CSV file
output_path = r'C:\Users\MITHUN CHAKRAVARTHY\Downloads\predictions_mal_svm.csv'  # Path to save the output CSV file
predict_data.to_csv(output_path, index=False)  # Use to_csv to save the data as a CSV file
print(f"Predictions saved to {output_path}")



Training Data Preview:
                ID                                               DATA  LABEL
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...  HUMAN
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം  HUMAN
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...  HUMAN
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...  HUMAN
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ  HUMAN
Missing values in training data:
ID       0
DATA     0
LABEL    0
dtype: int64
Validation Accuracy: 0.7875
Validation Classification Report:
              precision    recall  f1-score   support

          AI       0.78      0.80      0.79        80
       HUMAN       0.79      0.78      0.78        80

    accuracy                           0.79       160
   macro avg       0.79      0.79      0.79       160
weighted avg       0.79      0.79      0.79       160

Prediction Data Preview:
                ID    