In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw CSV that the next cell loads. The absolute "/content/..."
# path is environment-specific (presumably a Colab session - confirm) and has
# to be adjusted when the notebook is run elsewhere.
file_path = "/content/adult.data"

In [None]:
# Load the raw file with pandas' default parsing options.
# NOTE(review): no `header=` / `names=` is passed, so the first record of the
# file is consumed as the column labels (the labels displayed further down are
# '39', ' State-gov', ' 77516', ...) and string values keep their leading
# space. Passing `header=None`, explicit `names=[...]` and
# `skipinitialspace=True` would fix that, but later cells reference the
# current labels and values (" <=50K", ' ?'), so they would have to be
# changed in the same edit.
data = pd.read_csv(file_path)

In [None]:
# Sanity check: (rows, columns) of the frame as loaded.
data.shape

(32560, 15)

In [None]:
# Keep a handle on the column labels and display them.
# NOTE(review): the labels shown are data values ('39', ' State-gov', ...),
# i.e. the first record of the file ended up as the header row.
columns = data.columns
columns

Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')

In [None]:
# Drop every record that carries the ' ?' placeholder (note the leading space)
# in at least one column.
has_placeholder = data.eq(' ?').any(axis=1)

# `.copy()` makes the filtered frame an independent object. Without it pandas
# tracks the result as a possible copy of the unfiltered frame, and the column
# assignment performed in a later cell (`data["income_less"] = ...`) emits a
# SettingWithCopyWarning.
data = data.loc[~has_placeholder].copy()
data.head(5)


Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [None]:
# Re-check the size after the rows containing ' ?' were filtered out.
data.shape

(30161, 15)

In [None]:
# Print the per-column dtype and non-null count summary.
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 30161 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   39              30161 non-null  int64 
 1    State-gov      30161 non-null  object
 2    77516          30161 non-null  int64 
 3    Bachelors      30161 non-null  object
 4    13             30161 non-null  int64 
 5    Never-married  30161 non-null  object
 6    Adm-clerical   30161 non-null  object
 7    Not-in-family  30161 non-null  object
 8    White          30161 non-null  object
 9    Male           30161 non-null  object
 10   2174           30161 non-null  int64 
 11   0              30161 non-null  int64 
 12   40             30161 non-null  int64 
 13   United-States  30161 non-null  object
 14   <=50K          30161 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
# Binary target: 0 where the label column equals " <=50K", 1 for any other value.
# NOTE(review): despite its name, `income_less` is 1 for the rows that are
# *not* " <=50K".
differs_from_low_label = data[" <=50K"] != " <=50K"
data["income_less"] = differs_from_low_label.astype("int64")

In [None]:
# The textual label column has been encoded into `income_less`; remove it and
# preview the first rows of the result.
data = data.drop(" <=50K", axis=1)
data.head(5)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,income_less
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# One-hot encode the predictors: everything except the target `income_less`
# goes through `pd.get_dummies`.
predictors = data.drop(columns=['income_less'])
data_encoded = pd.get_dummies(predictors)

In [None]:
# Preview the encoded frame (integer columns followed by the dummy columns).
data_encoded.head()

Unnamed: 0,39,77516,13,2174,0,40,State-gov_ Federal-gov,State-gov_ Local-gov,State-gov_ Private,State-gov_ Self-emp-inc,...,United-States_ Portugal,United-States_ Puerto-Rico,United-States_ Scotland,United-States_ South,United-States_ Taiwan,United-States_ Thailand,United-States_ Trinadad&Tobago,United-States_ United-States,United-States_ Vietnam,United-States_ Yugoslavia
0,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,215646,9,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,53,234721,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,28,338409,13,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,37,284582,14,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False


In [None]:
# Feature matrix (one-hot encoded predictors) and binary target.
X = data_encoded
y = data['income_less']

# Standardise every column with StandardScaler.
# NOTE(review): the scaler and the PCA below are fitted on the full data set,
# i.e. before the train/test split done in a later cell, so the held-out rows
# influence the transform. Fitting both on the training rows only (e.g. via a
# sklearn Pipeline) would avoid that leakage.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rotate the data onto its principal axes, keeping EVERY component.
# `n_components=None` keeps min(n_samples, n_features) components, which is
# what the previously hard-coded 104 amounted to for this data, but it no
# longer breaks when the number of encoded columns changes. This is not a
# "95% of the variance" reduction: that would be `PCA(n_components=0.95)`,
# which yields fewer columns than `X` has, so any cell that pairs per-component
# values with `X.columns` would need to be adapted first.
pca = PCA(n_components=None)
X_pca = pca.fit_transform(X_scaled)

print(f"Original number of features: {X_scaled.shape[1]}")
print(f"Number of features after PCA: {X_pca.shape[1]}")

Original number of features: 104
Number of features after PCA: 104


In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 20% of the PCA-transformed rows for evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel support vector classifier on the training portion.
svm_clf = SVC(kernel='linear', random_state=42)
svm_clf.fit(X_train, y_train)

# Predict the held-out rows, then report accuracy and per-class metrics.
y_pred = svm_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {test_accuracy:.2f}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.84

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90      4535
           1       0.74      0.58      0.65      1498

    accuracy                           0.84      6033
   macro avg       0.80      0.76      0.78      6033
weighted avg       0.84      0.84      0.84      6033



In [None]:
# The classifier was trained on PCA scores, so `svm_clf.coef_[0]` holds one
# weight per principal component, NOT one per input column. Labelling those
# weights with `X.columns` directly produces a meaningless ranking.
component_weights = svm_clf.coef_[0]

# Guard against stale notebook state: a later cell rebinds both `pca` and
# `svm_clf`, so make sure the fitted PCA is the one this classifier was
# trained on before combining them.
assert component_weights.shape[0] == pca.components_.shape[0], (
    "`pca` does not match `svm_clf`; re-run the PCA and linear-SVM cells first"
)

# Map the weights back to the (standardised) input columns:
#   decision(x) = w . (components_ @ (x - mean)) = (w @ components_) . (x - mean)
# `pca.components_` has shape (n_components, n_features), so the product has
# one entry per column of `X`.
feature_weights = component_weights @ pca.components_

# The magnitude of each weight is used as the importance score.
feature_importance = np.abs(feature_weights)

# Pair every input column with its importance.
feat_imp_df = pd.DataFrame({
    "feature": X.columns,
    "importance": feature_importance
})

# Sort and pick top-k
k = 17
top_k = feat_imp_df.sort_values(by="importance", ascending=False).head(k)

print(top_k)

(104,)
                     feature  importance
79       United-States_ Hong    0.813976
0                         39    0.696048
86      United-States_ Japan    0.540753
1                      77516    0.496925
18        Bachelors_ 7th-8th    0.479083
82       United-States_ Iran    0.462735
80    United-States_ Hungary    0.455542
9    State-gov_ Self-emp-inc    0.388634
14           Bachelors_ 11th    0.351486
7       State-gov_ Local-gov    0.325899
5                         40    0.298204
74     United-States_ Greece    0.280767
72     United-States_ France    0.278463
73    United-States_ Germany    0.275361
69    United-States_ Ecuador    0.254934
23      Bachelors_ Doctorate    0.245027
11      State-gov_ State-gov    0.243405


In [None]:
# Sweep the `n_components` fraction handed to PCA (0.1 ... 0.9) and record the
# held-out accuracy of an RBF-kernel SVC for each resulting dimensionality.
# NOTE(review): this cell rebinds `svm_clf`, `pca`, `X_pca` and the split
# variables created by earlier cells.
mapper = {}
svm_clf = SVC(kernel='rbf', random_state=42)
for n in range(1, 10):
    pca = PCA(n_components=n / 10)
    X_pca = pca.fit_transform(X_scaled)
    n_kept = X_pca.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y, test_size=0.2, random_state=42
    )

    # The same estimator object is refitted from scratch on every iteration.
    svm_clf.fit(X_train, y_train)
    y_pred = svm_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Keyed by the number of retained components; accuracy rounded to 2 decimals.
    mapper[n_kept] = round(accuracy, 2)
    print("Done", n_kept, accuracy)

Done 4 0.813691364163766
Done 9 0.8246311950936516
Done 17 0.8286093154317918
Done 27 0.8354052710094481
Done 37 0.8340792308967346
Done 47 0.8347422509530913
Done 57 0.8340792308967346
Done 68 0.8359025360517156
Done 79 0.8402121664180342
