In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training dataset (this dataset will be used for training the model)
train_data_path = r'C:\Users\HP\Downloads\tam_training_data_hum_ai.csv'  # Path to training data
train_data = pd.read_csv(train_data_path)

# Display the first few rows to understand the structure of the training dataset
print("Training Data Preview:")
print(train_data.head())

# Report the per-column null counts before any rows are removed
print("Missing values in training data:")
print(train_data.isnull().sum())

# Only the text ('DATA') and the target ('LABEL') are fed to the model, so a
# row is discarded only when one of those two is missing. An unrestricted
# dropna() would also throw away usable examples whose sole gap is in an
# unrelated column such as 'ID'.
train_data.dropna(subset=['DATA', 'LABEL'], inplace=True)

# Separate features and labels for training dataset
X_train = train_data['DATA']  # Text reviews are read from the 'DATA' column
raw_labels = train_data['LABEL']  # Labels ('AI' or 'HUMAN') are read from the 'LABEL' column

# Convert labels to binary (0 for HUMAN, 1 for AI).
# Series.map() turns every value that is not a key of the dict into NaN
# without complaining, so labels are normalised (surrounding whitespace and
# letter case) and then checked once, up front, instead of letting NaN
# targets reach the classifier.
label_to_id = {'HUMAN': 0, 'AI': 1}
y_train = raw_labels.astype(str).str.strip().str.upper().map(label_to_id)
unmapped_rows = y_train.isna()
if unmapped_rows.any():
    unexpected = sorted(raw_labels[unmapped_rows].astype(str).unique())
    raise ValueError(
        f"Unexpected LABEL values (expected 'HUMAN' or 'AI'): {unexpected}"
    )

# Split the training data into training and validation sets.
# Encoding the labels before the split does not change which rows land in
# which set: the split arguments (test_size, random_state) are unchanged.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Text vectorization using TF-IDF for the training data.
# The vocabulary and IDF weights are fitted on the training split only; the
# validation split is merely transformed with them.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_val_tfidf = vectorizer.transform(X_val).toarray()

# Fit an XGBoost classifier on the TF-IDF matrix of the training split
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train_tfidf, y_train)

# Score the held-out validation split with the fitted model
y_val_pred = model.predict(X_val_tfidf)
accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

# Print the overall accuracy, then the per-class precision/recall table
print(f"Validation Accuracy: {accuracy}")
print("Validation Classification Report:", validation_report, sep="\n")

# Now load the prediction dataset (Excel file)
predict_data_path = r'C:\Users\HP\Downloads\tam_test_data_hum_ai.xlsx'  # Path to prediction data (Excel file)
predict_data = pd.read_excel(predict_data_path)  # Use pd.read_excel for Excel files

# Display the first few rows to understand the structure of the prediction dataset
print("Prediction Data Preview:")
print(predict_data.head())

# Report the per-column null counts before any rows are removed
print("Missing values in prediction data:")
print(predict_data.isnull().sum())

# A row can only be scored if it has text, so drop rows solely when 'DATA' is
# missing. An unrestricted dropna() would remove every row that has a null in
# ANY column (for example an empty label/placeholder column in an unlabeled
# test sheet), and those rows would then be silently absent from the saved
# predictions.
predict_data.dropna(subset=['DATA'], inplace=True)

# Pull out the text to score (read from the 'DATA' column of the prediction set)
X_predict = predict_data['DATA']

# Re-use the TF-IDF vectorizer fitted on the training split: transform only,
# then densify the resulting matrix
X_predict_sparse = vectorizer.transform(X_predict)
X_predict_tfidf = X_predict_sparse.toarray()

# Score the new texts with the trained model
predictions = model.predict(X_predict_tfidf)

# Turn numeric classes back into labels: 0 -> 'HUMAN', anything else -> 'AI'.
# The helper Series carries predict_data's own index so the assignment lines
# up row-for-row even if the index has gaps.
is_human = pd.Series(predictions, index=predict_data.index).eq(0)
predict_data['prediction'] = is_human.map({True: 'HUMAN', False: 'AI'})

# Write the input rows plus the new 'prediction' column to a CSV file
output_path = r'C:\Users\HP\Downloads\Tam_XGBoost.csv'  # Path to save the output CSV file
predict_data.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


Training Data Preview:
                ID                                               DATA LABEL
0  TAM_HUAI_TR_001  இந்த சோப்பின் மணம் மிகவும் புத்துணர்ச்சியூட்டு...    AI
1  TAM_HUAI_TR_002   தோலை நன்கு சுத்தம் செய்ய இது மிகவும் சிறப்பானது.    AI
2  TAM_HUAI_TR_003  இதைப் பயன்படுத்திய பிறகு, தோல் மிக மென்மையாக உ...    AI
3  TAM_HUAI_TR_004  இந்த சோப்பில் இயற்கையான மூலப்பொருட்கள் பயன்படு...    AI
4  TAM_HUAI_TR_005        சிறிது சோப்பு போதும், அதிக நுரை உருவாகிறது.    AI
Missing values in training data:
ID       0
DATA     0
LABEL    0
dtype: int64
Validation Accuracy: 0.8395061728395061
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82        76
           1       0.83      0.88      0.85        86

    accuracy                           0.84       162
   macro avg       0.84      0.84      0.84       162
weighted avg       0.84      0.84      0.84       162

Prediction Data Preview:
                