In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load Malayalam training data
data_path = r'C:\Users\HP\Downloads\mal_training_data_hum_ai.csv'  # Replace with your file path
data = pd.read_csv(data_path)

# Preprocessing
X = data['DATA']  # Feature column
y = data['LABEL']  # Target column

# Convert text data to numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression Classifier
clf = LogisticRegression(max_iter=1000, random_state=42)  # Specify max_iter to ensure convergence
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Add predictions to the original dataframe
data['Predictions'] = clf.predict(tfidf_vectorizer.transform(data['DATA']).toarray())

# Save the predictions to a new CSV file
output_path = "Mal_LogisticRegression(training).csv"
data.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

          AI       0.77      0.70      0.73        80
       HUMAN       0.72      0.79      0.75        80

    accuracy                           0.74       160
   macro avg       0.75      0.74      0.74       160
weighted avg       0.75      0.74      0.74       160

Predictions saved to Mal_LogisticRegression(training).csv


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load Malayalam training data
data_path = r'C:\Users\HP\Downloads\mal_training_data_hum_ai.csv'  # Replace with your file path
data = pd.read_csv(data_path)

# Preprocessing
X = data['DATA']  # Feature column
y = data['LABEL']  # Target column

# Convert text data to numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression Classifier
clf = LogisticRegression(max_iter=1000, random_state=42)  # Specify max_iter to ensure convergence
clf.fit(X_train, y_train)

# Make predictions on the validation set (if needed)
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Add predictions to the original training data
data['Predictions'] = clf.predict(tfidf_vectorizer.transform(data['DATA']).toarray())

# Save the predictions to a new CSV file
output_path = "Mal_LogisticRegression(training).csv"
data.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

# Now for the test data (no LABEL column)
test_data_path = r'C:\Users\HP\Downloads\mal_test_data_hum_ai.xlsx'  # Excel file path
test_data = pd.read_excel(test_data_path)

# Convert test data 'DATA' column to TF-IDF features using the already fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(test_data['DATA']).toarray()

# Predict the labels for the test data (AI or HUMAN)
test_predictions = clf.predict(X_test_tfidf)

# Add the predictions to the test data
test_data['Predictions'] = test_predictions

# Save the predictions to a new CSV file
test_output_path = "Mal_LogisticRegression.csv"
test_data.to_csv(test_output_path, index=False)
print(f"Test predictions saved to {test_output_path}")


Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

          AI       0.77      0.70      0.73        80
       HUMAN       0.72      0.79      0.75        80

    accuracy                           0.74       160
   macro avg       0.75      0.74      0.74       160
weighted avg       0.75      0.74      0.74       160

Predictions saved to Mal_LogisticRegression(training).csv
Test predictions saved to Mal_LogisticRegression.csv
