Generating Synthetic Dataset using the parameter weightage and probablity distribution





In [2]:
import numpy as np
import pandas as pd

# Number of samples
n_samples = 1000

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data with controlled ranges

# Wealth_Score: Between 1 and 10
wealth_score = np.random.randint(3, 11, n_samples)

# Rank_Progression: Time spent in ranks, 1 to 30 years
rank_progression = np.random.randint(12, 360, n_samples) / 12  # Convert months to years

# Mission_Success: Between 50 and 100 (as percentage)
mission_success = np.random.randint(50, 101, n_samples)

# Peer_Reviews: Between 3 and 5 (teamwork, trust rating)
peer_reviews = np.random.randint(3, 6, n_samples)

# Attendance_Records: Between 50% to 100%
attendance_records = np.random.randint(50, 101, n_samples)

# Disciplinary_Actions: Biased distribution with more zeros
disciplinary_actions = np.random.binomial(n=10, p=0.2, size=n_samples)

# Sentiment_Score: Between 0 (negative) and 1 (positive)
sentiment_score = np.random.uniform(0, 1, n_samples)

# Proximity_to_Enemy: Binary (0 = far, 1 = near)
proximity_to_enemy = np.random.randint(0, 2, n_samples)

# Propaganda_Influence: Between 0 and 4
propaganda_influence = np.random.randint(0, 5, n_samples)

# Loyalty_History: Binary (0 = no past betrayal, 1 = past betrayal)
loyalty_history = np.random.randint(0, 2, n_samples)

# Now generate the 'Betrayal' column based on certain conditions
# Start with a low base probability for betrayal
prob_betrayal = np.zeros(n_samples)

# Lower the weights to reduce the number of cases where betrayal occurs
prob_betrayal += (wealth_score <= 3) * 0.1  # Reduce wealth score weight
prob_betrayal += (mission_success <= 60) * 0.05  # Reduce mission success weight
prob_betrayal += (peer_reviews <= 3) * 0.05  # Reduce peer review weight
prob_betrayal += (attendance_records <= 60) * 0.05  # Reduce attendance weight

# Keep other factors with slight influence
prob_betrayal += (propaganda_influence >= 3) * 0.1  # Slight influence of propaganda
prob_betrayal += (proximity_to_enemy == 1) * 0.05  # Slight proximity influence
prob_betrayal += (loyalty_history == 1) * 0.1  # Slight influence of past betrayal
prob_betrayal += (disciplinary_actions >= 3) * 0.05  # Keep disciplinary actions low

# Clip probabilities to stay between 0 and 0.2 (further reduce betrayal probability)
prob_betrayal = np.clip(prob_betrayal, 0, 0.2)

# Generate the final 'Betrayal' values based on the probabilities
betrayal = np.random.binomial(1, prob_betrayal)

# Combine all the features into a DataFrame
df = pd.DataFrame({
    'Wealth_Score': wealth_score,
    'Rank_Progression_Years': rank_progression,
    'Mission_Success': mission_success,
    'Peer_Reviews': peer_reviews,
    'Attendance_Records': attendance_records,
    'Disciplinary_Actions': disciplinary_actions,
    'Sentiment_Score': sentiment_score,
    'Proximity_to_Enemy': proximity_to_enemy,
    'Propaganda_Influence': propaganda_influence,
    'Loyalty_History': loyalty_history,
    'Betrayal': betrayal
})

# Display the distribution of 'Betrayal'
print(df['Betrayal'].value_counts(normalize=True))

# Display the first few rows of the dataset
print(df.head())


Betrayal
0    0.836
1    0.164
Name: proportion, dtype: float64
   Wealth_Score  Rank_Progression_Years  Mission_Success  Peer_Reviews  \
0             9               13.000000               61             5   
1             6               17.666667               67             5   
2             7               18.583333               64             3   
3             9               19.250000               93             3   
4             5               20.916667               65             5   

   Attendance_Records  Disciplinary_Actions  Sentiment_Score  \
0                  90                     2         0.118219   
1                  73                     1         0.000031   
2                  84                     3         0.712137   
3                  79                     2         0.356596   
4                  79                     2         0.254481   

   Proximity_to_Enemy  Propaganda_Influence  Loyalty_History  Betrayal  
0                   1            

In [9]:
df.head()

Unnamed: 0,Wealth_Score,Rank_Progression_Years,Mission_Success,Peer_Reviews,Attendance_Records,Disciplinary_Actions,Sentiment_Score,Proximity_to_Enemy,Propaganda_Influence,Loyalty_History,Betrayal
0,9,13.0,61,5,90,2,0.118219,1,4,1,0
1,6,17.666667,67,5,73,1,3.1e-05,0,4,0,0
2,7,18.583333,64,3,84,3,0.712137,1,1,1,0
3,9,19.25,93,3,79,2,0.356596,1,1,0,0
4,5,20.916667,65,5,79,2,0.254481,0,3,0,0


In [5]:
count_zero_betrayal = df['Betrayal'].value_counts().get(0, 0)
print(f"Number of 0s in Betrayal column: {count_zero_betrayal}")


Number of 0s in Betrayal column: 836


Exporting the Synthetic Dataset as .csv fiel

In [8]:
# Export the DataFrame to a CSV file
df.to_csv('synthetic_betrayal_data.csv', index=False)

# Verify the file has been saved
print("Dataset successfully exported as 'synthetic_betrayal_data.csv'")


Dataset successfully exported as 'synthetic_betrayal_data.csv'


Preprocessing and Model building

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Preprocessing

# Feature matrix X and target vector y
X = df.drop(columns=['Betrayal'])  # All features except the target
y = df['Betrayal']  # Target variable

# Train-test split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the numerical features (since they are on different scales)
scaler = StandardScaler()

# Fit the scaler on training data and transform both train and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Building

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_test_scaled)

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train_scaled, y_train)
y_pred_gb = gb_clf.predict(X_test_scaled)

# Model Evaluation Function

def evaluate_model(y_test, y_pred, model_name):
    print(f"Evaluation of {model_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate Logistic Regression
evaluate_model(y_test, y_pred_log_reg, "Logistic Regression")

# Evaluate Random Forest
evaluate_model(y_test, y_pred_rf, "Random Forest")

# Evaluate Gradient Boosting
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")

# Hyperparameter Tuning for Random Forest (optional to boost accuracy)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train_scaled, y_train)

# Best parameters for Random Forest
print("Best Random Forest Params:", grid_search_rf.best_params_)

# Make predictions with the tuned Random Forest model
y_pred_rf_tuned = grid_search_rf.best_estimator_.predict(X_test_scaled)

# Evaluate the tuned Random Forest
evaluate_model(y_test, y_pred_rf_tuned, "Tuned Random Forest")

# Check if we are getting 90% accuracy with the tuned model


Evaluation of Logistic Regression:
Accuracy: 0.83
Confusion Matrix:
 [[166   0]
 [ 34   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       166
           1       0.00      0.00      0.00        34

    accuracy                           0.83       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.69      0.83      0.75       200

Evaluation of Random Forest:
Accuracy: 0.83
Confusion Matrix:
 [[166   0]
 [ 34   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       166
           1       0.00      0.00      0.00        34

    accuracy                           0.83       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.69      0.83      0.75       200

Evaluation of Gradient Boosting:
Accuracy: 0.81
Confusion Matrix:
 [[161   5]
 [ 33   1]]
Classification Report:
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Random Forest Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Evaluation of Tuned Random Forest:
Accuracy: 0.83
Confusion Matrix:
 [[166   0]
 [ 34   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91       166
           1       0.00      0.00      0.00        34

    accuracy                           0.83       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.69      0.83      0.75       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
