In [27]:
import ast

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# End-to-end experiment: load labelled embeddings, reduce them with PCA,
# train a logistic-regression classifier, report hold-out metrics, and write
# per-row predictions to a new CSV file.

SEED = 42          # one seed for shuffle, split and PCA so reruns reproduce
N_COMPONENTS = 50  # upper bound on PCA dimensions; adjust as needed

# Load the CSV file
data = pd.read_csv('LatestDataSet.csv')

# Shuffle the dataset (seeded, so a fresh kernel yields the same row order)
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Extract the embeddings and labels.
# ast.literal_eval only parses Python literals, so a crafted cell in the CSV
# cannot execute arbitrary code the way eval() would.
# NOTE(review): assumes every 'Embedding' cell is a list literal of numbers
# with the same length -- confirm against the dataset.
X = data['Embedding'].apply(ast.literal_eval).tolist()
y = data['Label']

# Split BEFORE fitting PCA: fitting the projection on all rows would let the
# test rows influence the features the model is trained on (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=SEED
)

# PCA cannot extract more components than min(n_samples, n_features) of the
# data it is fitted on, so cap the requested count for small training splits.
n_components = min(N_COMPONENTS, len(X_train_raw), len(X_train_raw[0]))
pca = PCA(n_components=n_components, random_state=SEED)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the held-out half
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Project every row with the train-fitted PCA (reusing the embeddings parsed
# above instead of parsing the column a second time) and add the predictions
# as a new column. Note that this column covers training rows as well as test
# rows, so it must not be read as hold-out performance.
X_pca = pca.transform(X)
data['predictions'] = model.predict(X_pca)

# Save the updated DataFrame to a new CSV file
output_file_path = 'newoutput.csv'
data.to_csv(output_file_path, index=False)


Accuracy: 1.0
Confusion Matrix:
[[12  0]
 [ 0 13]]
