In [None]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [1]:
# Dataset viewer: https://huggingface.co/datasets/Hello-SimpleAI/HC3/viewer/all/train?views%5B%5D=all_train&row=13
from datasets import load_dataset

# Fetch the "all" configuration of HC3, requesting every split at once.
dataset = load_dataset(
    path="Hello-SimpleAI/HC3",
    name="all",
    split="all",
)

# Show the first record to see which fields each row carries.
print(dataset[0])


{'id': '0', 'question': 'Why is every book I hear about a " NY Times # 1 Best Seller " ? ELI5 : Why is every book I hear about a " NY Times # 1 Best Seller " ? Should n\'t there only be one " # 1 " best seller ? Please explain like I\'m five.', 'human_answers': ['Basically there are many categories of " Best Seller " . Replace " Best Seller " by something like " Oscars " and every " best seller " book is basically an " oscar - winning " book . May not have won the " Best film " , but even if you won the best director or best script , you \'re still an " oscar - winning " film . Same thing for best sellers . Also , IIRC the rankings change every week or something like that . Some you might not be best seller one week , but you may be the next week . I guess even if you do n\'t stay there for long , you still achieved the status . Hence , # 1 best seller .', "If you 're hearing about it , it 's because it was a very good or very well - publicized book ( or both ) , and almost every good 

In [2]:
import pandas as pd

# Convert the loaded dataset into a pandas DataFrame and preview its first
# five rows as plain text.
df = dataset.to_pandas()
print(df.head(n=5))

  id                                           question  \
0  0  Why is every book I hear about a " NY Times # ...   
1  1  If salt is so bad for cars , why do we use it ...   
2  2  Why do we still have SD TV channels when HD lo...   
3  3  Why has nobody assassinated Kim Jong - un He i...   
4  4  How was airplane technology able to advance so...   

                                       human_answers  \
0  [Basically there are many categories of " Best...   
1  [salt is good for not dying in car crashes and...   
2  [The way it works is that old TV stations got ...   
3  [You ca n't just go around assassinating the l...   
4  [Wanting to kill the shit out of Germans drive...   

                                     chatgpt_answers       source  
0  [There are many different best seller lists th...  reddit_eli5  
1  [Salt is used on roads to help melt ice and sn...  reddit_eli5  
2  [There are a few reasons why we still have SD ...  reddit_eli5  
3  [It is generally not acceptable o

In [3]:
# Build a long-format frame: one copy of the questions paired with the human
# answers (type='human') stacked on top of a second copy paired with the
# ChatGPT answers (type='gpt'). Only question/answer/type are carried over, so
# the original 'source' column never makes it into the merged frame.
merged_data = pd.concat(
    [
        df[['question', answer_col]]
        .rename(columns={answer_col: 'answer'})
        .assign(type=label)
        for answer_col, label in (('human_answers', 'human'), ('chatgpt_answers', 'gpt'))
    ],
    ignore_index=True,
)

# Plain-text preview of the first rows of the merged frame.
print(merged_data.head())

                                            question  \
0  Why is every book I hear about a " NY Times # ...   
1  If salt is so bad for cars , why do we use it ...   
2  Why do we still have SD TV channels when HD lo...   
3  Why has nobody assassinated Kim Jong - un He i...   
4  How was airplane technology able to advance so...   

                                              answer   type  
0  [Basically there are many categories of " Best...  human  
1  [salt is good for not dying in car crashes and...  human  
2  [The way it works is that old TV stations got ...  human  
3  [You ca n't just go around assassinating the l...  human  
4  [Wanting to kill the shit out of Germans drive...  human  


In [4]:
# Rich (table) preview of the first five merged rows.
merged_data.head(n=5)

Unnamed: 0,question,answer,type
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",human
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,human
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,human
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,human
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,human


In [5]:
# Class balance: how many rows carry each label in the 'type' column.
label_column = merged_data['type']
label_column.value_counts()

type
human    24322
gpt      24322
Name: count, dtype: int64

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# NOTE: this rebinds `df`, shadowing the raw frame created from the dataset.
# The merge cell reads df['human_answers'] / df['chatgpt_answers'], so
# re-running it after this cell requires re-running the DataFrame cell first.
df = merged_data.copy()

# Each 'answer' cell holds a sequence of answer strings; flatten it into one
# space-separated document per row.
df['answer'] = df['answer'].apply(lambda x: ' '.join(x))

# Raw documents and their labels ('human' / 'gpt').
X = df['answer']
y = df['type']

# Split the RAW text first. Fitting the vectorizer before splitting would let
# the held-out documents shape the vocabulary and IDF weights (test-set
# leakage), which makes the evaluation optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the 5000-term vocabulary and IDF weights from the training documents
# only, then project the held-out documents with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Kept so existing references to `X_tfidf` still work: features for the whole
# corpus, computed with the train-fitted vectorizer (transform only, no refit).
X_tfidf = vectorizer.transform(X)

In [7]:
# Unfitted support-vector classifier with a linear kernel and a fixed seed;
# training happens in a later cell.
svm_model = SVC(
    random_state=42,
    kernel='linear',
)

In [14]:
# Fit the SVM on the training split and immediately label the held-out rows;
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         gpt       0.95      0.98      0.96      4904
       human       0.98      0.95      0.96      4825

    accuracy                           0.96      9729
   macro avg       0.96      0.96      0.96      9729
weighted avg       0.96      0.96      0.96      9729



In [20]:
# Persist the trained SVM to disk; the cell displays the list of files written.
import joblib

joblib.dump(value=svm_model, filename='svm_model.pkl')

['svm_model.pkl']

In [21]:
# Persist the fitted TF-IDF vectorizer next to the model so that new text can
# be transformed with the same vocabulary at prediction time.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [22]:
# Read both artefacts back from disk, model first, then the TF-IDF vectorizer.
# These are pickle-based files: only load files this notebook wrote itself.
loaded_model, loaded_vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ('svm_model.pkl', 'tfidf_vectorizer.pkl')
)


In [18]:
# For the question: "What are the benefits of exercise?"

In [23]:
# Sample 1: a casual, first-person answer to the exercise question.
test1 = [
    "Honestly, exercise has so many upsides. It keeps your heart healthy, helps you stay in shape, and can even boost your mood when you're having a rough day. I’ve noticed that when I work out regularly, I sleep better and feel less stressed. Plus, it’s a great way to clear your head if you’ve been stuck inside staring at a screen all day. Even a short walk can make a huge difference.",
]

# Vectorise with the reloaded TF-IDF vectorizer, then classify with the
# reloaded model; the predicted label array is the cell's output.
vector_test1 = loaded_vectorizer.transform(raw_documents=test1)
loaded_model.predict(X=vector_test1)

array(['human'], dtype=object)

In [24]:
# Sample 2: a formal, list-like answer to the same exercise question.
test2 = ["Exercise offers numerous benefits for both physical and mental health. Regular physical activity can help improve cardiovascular health, strengthen muscles, and enhance flexibility. It also supports weight management and reduces the risk of chronic diseases such as diabetes and hypertension. Additionally, exercise is known to boost mood, reduce stress, and improve sleep quality. Incorporating consistent exercise into your daily routine can lead to a healthier and more balanced lifestyle"]

# Store this sample's features under their own name. The previous version
# reused `vector_test1`, silently overwriting the first sample's vector.
vector_test2 = loaded_vectorizer.transform(test2)
loaded_model.predict(vector_test2)

array(['gpt'], dtype=object)

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the SVM, written as one sub-grid per kernel.
# `gamma` has no effect on the linear kernel, so crossing it with
# kernel='linear' (as a single flat grid does) only refits identical models:
# 16 candidates where 12 are distinct. Splitting the grid keeps every distinct
# configuration and drops the duplicates.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validated search scored on accuracy, using all available cores.
# Every candidate is refitted once per fold, so expect this cell to be slow.
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate the winning configuration (refitted on the full training split by
# GridSearchCV) on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print(classification_report(y_test, y_pred_best))