In [6]:
import ast
import os

import pandas as pd

# Set the input directory
input_dir = "output B"


def _leading_number(filename):
    """Return the integer before the first "_" in `filename`, or None if that part is not an integer."""
    try:
        return int(filename.split("_")[0])
    except ValueError:
        return None


# Get the list of CSV files in the input directory
csv_files = [file for file in os.listdir(input_dir) if file.endswith(".csv")]

# Only files whose name starts with an integer prefix can be ranked; a stray
# CSV without such a prefix is skipped instead of crashing the int() conversion.
numbered_csv_files = [file for file in csv_files if _leading_number(file) is not None]

# Find the highest numbered CSV file
if not numbered_csv_files:
    # Raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down, whereas an exception just stops this cell (and a "Run All") with a
    # readable error.
    raise FileNotFoundError(
        f"No CSV files with a numeric prefix found in the input directory {input_dir!r}."
    )

latest_file = max(numbered_csv_files, key=_leading_number)
input_path = os.path.join(input_dir, latest_file)
df = pd.read_csv(input_path)

# Discard rows where C_Keywords is '~'
df = df[df['C_Keywords'] != '~']

# Create the first DataFrame (unprocessed)
# Each C_Unique_List / R_Unique_List cell is text that is parsed into a
# sequence of keyword strings. ast.literal_eval is used rather than eval
# because the text comes from a file on disk: eval would execute any
# expression found there, while literal_eval only accepts Python literals.
data_list_unprocessed = []

for c_cell, r_cell in zip(df['C_Unique_List'], df['R_Unique_List']):
    c_unique_list = ast.literal_eval(c_cell)
    r_unique_list = ast.literal_eval(r_cell)

    # Two output rows per input row: the C keywords labelled Source=1,
    # followed by the R keywords labelled Source=0.
    data_list_unprocessed.append({'Keyword': ', '.join(c_unique_list), 'Source': 1})
    data_list_unprocessed.append({'Keyword': ', '.join(r_unique_list), 'Source': 0})

df_unprocessed = pd.DataFrame(data_list_unprocessed)

# Create the second DataFrame (processed)
def _keywords_from_cell(cell):
    """Split a ', '-separated keyword string into a list; any non-string cell yields []."""
    return cell.split(', ') if isinstance(cell, str) else []


data_list_processed = []

for _, record in df.iterrows():
    c_terms = _keywords_from_cell(record['C_Keywords_Unique'])
    r_terms = _keywords_from_cell(record['R_Keywords_Unique'])

    # Two output rows per input row: C keywords (Source=1), then R keywords (Source=0).
    data_list_processed.extend([
        {'Keyword': ', '.join(c_terms), 'Source': 1},
        {'Keyword': ', '.join(r_terms), 'Source': 0},
    ])

df_processed = pd.DataFrame(data_list_processed)

# Print the DataFrames, each under its own heading
for heading, frame in (
    ("Unprocessed DataFrame:", df_unprocessed),
    ("\nProcessed DataFrame:", df_processed),
):
    print(heading)
    print(frame)

Unprocessed DataFrame:
                                                Keyword  Source
0     funny, point, get, results, involve, sorry, pr...       1
1     funny, really, pen, finger, ink, someone, stic...       0
2     touch, stressed, alcohol, emotions, breaths, c...       1
3     alcohol, glad, enjoying, intake, feel, drink, ask       0
4     person, seems, gullible, phone, tries, like, p...       1
...                                                 ...     ...
2455  information, first, dealer, find, need, could,...       0
2456  think, important, legal, fellow, world, challe...       1
2457  like, better, think, treating, would, stopped,...       0
2458  try, best, might, answer, question, anyway, as...       1
2459             beverly, hills, drive, ca, north, sure       0

[2460 rows x 2 columns]

Processed DataFrame:
                                                Keyword  Source
0     lively, whimsical, amusing, playful, satirical...       1
1                          humorou

In [10]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# NOTE: this cell uses df_unprocessed and df_processed, which are built in the previous cell.

# Perform logistic regression on the unprocessed DataFrame
# Features are token counts of each 'Keyword' string; the label is 'Source'.
y_unprocessed = df_unprocessed['Source']
vectorizer_unprocessed = CountVectorizer()
X_unprocessed = vectorizer_unprocessed.fit_transform(df_unprocessed['Keyword'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_unprocessed = train_test_split(
    X_unprocessed, y_unprocessed, test_size=0.2, random_state=42)
(X_train_unprocessed, X_test_unprocessed,
 y_train_unprocessed, y_test_unprocessed) = split_unprocessed

# fit() returns the estimator itself, so construction and training can be chained.
model_unprocessed = LogisticRegression().fit(X_train_unprocessed, y_train_unprocessed)
y_pred_unprocessed = model_unprocessed.predict(X_test_unprocessed)

accuracy_unprocessed = accuracy_score(y_test_unprocessed, y_pred_unprocessed)

# Output the top 10 coefficients by absolute magnitude for the unprocessed model
feature_names_unprocessed = vectorizer_unprocessed.get_feature_names_out()
coef_unprocessed = model_unprocessed.coef_[0]
# argsort is ascending, so reverse it and keep the first ten (largest |coef| first).
ranking_unprocessed = np.argsort(np.abs(coef_unprocessed))
top_indices_unprocessed = ranking_unprocessed[::-1][:10]
print("Top 10 Coefficients by Absolute Magnitude (Unprocessed Model):")
for term, weight in zip(feature_names_unprocessed[top_indices_unprocessed],
                        coef_unprocessed[top_indices_unprocessed]):
    print(f"{term}: {weight}")
print()

# Perform logistic regression on the processed DataFrame
# Features are token counts of each 'Keyword' string; the label is 'Source'.
y_processed = df_processed['Source']
vectorizer_processed = CountVectorizer()
X_processed = vectorizer_processed.fit_transform(df_processed['Keyword'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_processed = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42)
(X_train_processed, X_test_processed,
 y_train_processed, y_test_processed) = split_processed

# fit() returns the estimator itself, so construction and training can be chained.
model_processed = LogisticRegression().fit(X_train_processed, y_train_processed)
y_pred_processed = model_processed.predict(X_test_processed)

accuracy_processed = accuracy_score(y_test_processed, y_pred_processed)

# Output the top 10 coefficients by absolute magnitude for the processed model
feature_names_processed = vectorizer_processed.get_feature_names_out()
coef_processed = model_processed.coef_[0]
# argsort is ascending, so reverse it and keep the first ten (largest |coef| first).
ranking_processed = np.argsort(np.abs(coef_processed))
top_indices_processed = ranking_processed[::-1][:10]
print("Top 10 Coefficients by Absolute Magnitude (Processed Model):")
for term, weight in zip(feature_names_processed[top_indices_processed],
                        coef_processed[top_indices_processed]):
    print(f"{term}: {weight}")
print()

# Compare the accuracy of unprocessed and processed models
for label, score in (
    ("Unprocessed Model Accuracy:", accuracy_unprocessed),
    ("Processed Model Accuracy:", accuracy_processed),
):
    print(label, score)

Top 10 Coefficients by Absolute Magnitude (Unprocessed Model):
choice: -1.327749413058812
humans: 1.2052426635741094
quick: -1.1643989774121504
depends: -1.1641307113028614
alright: -1.153836272070192
result: 1.1529395860460727
conversation: 1.147279785310721
ca: -1.1385399825454838
call: 1.134710847142368
whatever: -1.1278333647451908

Top 10 Coefficients by Absolute Magnitude (Processed Model):
reserved: 1.303971597671775
emotional: 1.2905864879729863
skeptically: -1.261087055920453
request: 1.1939814055543314
certain: 1.179597102230682
better: 1.161258924879242
inapprop: -1.1123469677395215
seriously: 1.1107739100749179
disbelief: -1.0970710205073257
find: 1.0818256981590093

Unprocessed Model Accuracy: 0.5060975609756098
Processed Model Accuracy: 0.5691056910569106
