In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score

# Sample data (replace with your actual data loading method)


df = pd.read_csv('/content/train.csv')

# Display the first few rows
print(df.head())

# Step 2: Inspect Data Types
print("Data types before encoding:")
print(df.dtypes)

# Step 3: Encode Categorical Variables
# Label Encoding for binary columns
label_encoders = {}
binary_columns = ['gender', 'jaundice', 'austim', 'used_app_before', 'relation', 'age_desc']

for col in binary_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# One-Hot Encoding for non-binary columns
df = pd.get_dummies(df, columns=['ethnicity', 'contry_of_res'])

# Display the first few rows after encoding
print("Data after encoding:")
print(df.head())

# Define features and target
X = df.drop(columns=['ID', 'Class/ASD'])  # Drop ID and target columns
y = df['Class/ASD']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = svm.SVC()
model.fit(X_train, y_train)

# Predict
x_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

print('Accuracy score of the training data : ', training_data_accuracy)

x_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

print('Accuracy score of the test data : ', test_data_accuracy)

# Sample new input data
input_data = ['f', 1, 0, 1, 0, 1, 0, 1, 0, 1, '?', 'no', 'no', 'United States', 'no', 14.851484, '18 and more', 'Self']

# Define the same label encoders used during training
label_encoders = {
    'gender': LabelEncoder().fit(['f', 'm']),
    'ethnicity': LabelEncoder().fit(['?', 'White-European']),
    'jaundice': LabelEncoder().fit(['no', 'yes']),
    'austim': LabelEncoder().fit(['no', 'yes']),
    'used_app_before': LabelEncoder().fit(['no', 'yes']),
    'age_desc': LabelEncoder().fit(['18 and more']),
    'relation': LabelEncoder().fit(['Self'])
}

# Encode the input data
encoded_input_data = [
    label_encoders['gender'].transform([input_data[0]])[0],
    input_data[1],
    input_data[2],
    input_data[3],
    input_data[4],
    input_data[5],
    input_data[6],
    input_data[7],
    input_data[8],
    input_data[9],
    label_encoders['ethnicity'].transform([input_data[10]])[0],
    label_encoders['jaundice'].transform([input_data[11]])[0],
    label_encoders['austim'].transform([input_data[12]])[0],
    label_encoders['used_app_before'].transform([input_data[14]])[0],
    input_data[15],
    label_encoders['age_desc'].transform([input_data[16]])[0],
    label_encoders['relation'].transform([input_data[17]])[0]
]

# One-Hot Encoding for 'contry_of_res'
one_hot_encoded_country = pd.get_dummies(['Austria', 'India', 'United States', 'South Africa']).reindex(columns=['Austria', 'India', 'United States', 'South Africa']).fillna(0)
country_vector = one_hot_encoded_country.loc[:, input_data[13]].values.flatten()

# Combine encoded data and one-hot encoded country vector
complete_encoded_input = encoded_input_data[:13] + country_vector.tolist() + encoded_input_data[13:]

# Reshape the input data
input_data_as_numpy_array = np.asarray(complete_encoded_input)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make the prediction
prediction = model.predict(input_data_reshaped)
print(prediction)



   ID  A1_Score  A2_Score  A3_Score  A4_Score  A5_Score  A6_Score  A7_Score  \
0   1         1         0         1         0         1         0         1   
1   2         0         0         0         0         0         0         0   
2   3         1         1         1         1         1         1         1   
3   4         0         0         0         0         0         0         0   
4   5         0         0         0         0         0         0         0   

   A8_Score  A9_Score  ...  gender       ethnicity jaundice austim  \
0         0         1  ...       f               ?       no     no   
1         0         0  ...       m               ?       no     no   
2         1         1  ...       m  White-European       no    yes   
3         0         0  ...       f               ?       no     no   
4         0         0  ...       m               ?       no     no   

   contry_of_res used_app_before     result     age_desc  relation Class/ASD  
0        Austria         



ValueError: X has 21 features, but SVC is expecting 86 features as input.