In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw CSV; `data` is kept as the untouched reference frame.
data = pd.read_csv(filepath_or_buffer="drug_consumption.csv")

In [3]:
# Work on an independent (deep) copy so edits to `df` never touch `data`.
df = data.copy(deep=True)

In [4]:
# Drop the row identifier. Deriving from `data` (drop returns a new frame)
# rather than from `df` keeps this cell re-runnable: `df = df.drop(...)`
# raises KeyError the second time because 'ID' is already gone.
df = data.drop(columns=['ID'])

In [5]:
# List the column labels currently present in df.
df.columns

Index(['Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'Nscore',
       'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS', 'Alcohol',
       'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack',
       'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms',
       'Nicotine', 'Semer', 'VSA'],
      dtype='object')

In [6]:
# Split the column labels by dtype: numeric columns go to `features`,
# object-typed columns go to `labels`.
features = df.select_dtypes(include="number").columns
labels = df.select_dtypes(include="object").columns

In [7]:
# Numeric columns with fewer than 10 distinct values are treated as
# encoded categoricals rather than continuous scores.
cat_cols = [col for col in features if df[col].nunique() < 10]
cat_cols

['Age', 'Gender', 'Education', 'Country', 'Ethnicity']

### Age

In [8]:
# Build the age-band labels ("18 - 24", then four 10-year bands, then "65+")
# and pair them, in ascending order, with the sorted numeric codes found in
# the first categorical column.
first_band_top = 24
band_edges = [(18, first_band_top)]
for step in range(1, 5):
    band_edges.append((first_band_top + 10 * (step - 1) + 1, first_band_top + 10 * step))
age_groups = [f"{low} - {high}" for low, high in band_edges]
age_groups += ["65+"]
sorted_age_codes = sorted(df[cat_cols[0]].unique())
Age_map = {code: age_groups[pos] for pos, code in enumerate(sorted_age_codes)}
Age_map

{-0.95197: '18 - 24',
 -0.07854: '25 - 34',
 0.49788: '35 - 44',
 1.09449: '45 - 54',
 1.82213: '55 - 64',
 2.59171: '65+'}

### Gender

In [9]:
# Explicit code -> label table for the Gender column.
# The labels used to be zipped against `df[...].unique()`, which returns values
# in order of first appearance; the result therefore depended on which gender
# happened to be in the first row and would silently flip on a re-sorted file.
Gender_map = {0.48246: "Female", -0.48246: "Male"}
Gender_map

{0.48246: 'Female', -0.48246: 'Male'}

### Education

In [10]:
# Education labels listed from lowest to highest level; they are paired with
# the numeric codes of the third categorical column sorted ascending.
education_groups = [
    "Left School Before 16 years",
    "Left School at 16 years",
    "Left School at 17 years",
    "Left School at 18 years",
    "Some College,No Certificate Or Degree",
    "Professional Certificate/ Diploma",
    "University Degree",
    "Masters Degree",
    "Doctorate Degree",
]
education_codes = sorted(df[cat_cols[2]].unique())
Education_map = {code: label for code, label in zip(education_codes, education_groups)}
Education_map

{-2.43591: 'Left School Before 16 years',
 -1.7379: 'Left School at 16 years',
 -1.43719: 'Left School at 17 years',
 -1.22751: 'Left School at 18 years',
 -0.61113: 'Some College,No Certificate Or Degree',
 -0.05921: 'Professional Certificate/ Diploma',
 0.45468: 'University Degree',
 1.16365: 'Masters Degree',
 1.98437: 'Doctorate Degree'}

### Country

In [11]:
# Numeric code -> country of residence label.
# Fixes the truncated label "New Zealan", which would otherwise be written
# into the decoded data as-is.
Country_map = {
    -0.09765: "Australia",
    0.24923: "Canada",
    -0.46841: "New Zealand",
    -0.28519: "Other",
    0.21128: "Republic of Ireland",
    0.96082: "UK",
    -0.57009: "USA",
}
Country_map

{-0.09765: 'Australia',
 0.24923: 'Canada',
 -0.46841: 'New Zealan',
 -0.28519: 'Other',
 0.21128: 'Republic of Ireland',
 0.96082: 'UK',
 -0.57009: 'USA'}

### Ethnicity

In [12]:
# Numeric code -> ethnicity label, written as (code, label) pairs.
ethnicity_pairs = [
    (-0.50212, "Asian"),
    (-1.10702, "Black"),
    (1.90725, "Mixed-Black/Asian"),
    (0.12600, "Mixed-White/Asian"),
    (-0.22166, "Mixed-White/Black"),
    (0.11440, "Other"),
    (-0.31685, "White"),
]
Ethnicity_map = dict(ethnicity_pairs)
Ethnicity_map

{-0.50212: 'Asian',
 -1.10702: 'Black',
 1.90725: 'Mixed-Black/Asian',
 0.126: 'Mixed-White/Asian',
 -0.22166: 'Mixed-White/Black',
 0.1144: 'Other',
 -0.31685: 'White'}

In [13]:
# Meaning of the usage classes "CL0" .. "CL6"; the list position is the
# class number.
usage_levels = [
    "Never Used",
    "Used over a Decade Ago",
    "Used in Last Decade",
    "Used in Last Year",
    "Used in Last Month",
    "Used in Last Week",
    "Used in Last Day",
]
Label_map = {f"CL{level}": meaning for level, meaning in enumerate(usage_levels)}
Label_map

{'CL0': 'Never Used',
 'CL1': 'Used over a Decade Ago',
 'CL2': 'Used in Last Decade',
 'CL3': 'Used in Last Year',
 'CL4': 'Used in Last Month',
 'CL5': 'Used in Last Week',
 'CL6': 'Used in Last Day'}

In [14]:
# Decode every categorical column from its numeric code to a readable label.
# `maps` is ordered to line up positionally with `cat_cols`.
maps = [Age_map, Gender_map, Education_map, Country_map, Ethnicity_map]

# zip() would silently stop at the shorter sequence, leaving columns undecoded.
if len(maps) != len(cat_cols):
    raise ValueError(
        f"expected {len(maps)} categorical columns, found {len(cat_cols)}: {list(cat_cols)}"
    )

for col, mapping in zip(cat_cols, maps):
    # Read the codes from the untouched `data` frame rather than from `df`, so
    # re-running this cell does not try to map already-decoded strings (which
    # would turn the whole column into NaN).
    decoded = data[col].map(mapping)
    # Series.map yields NaN for any value missing from the mapping; surface
    # that instead of silently writing missing labels.
    unmapped = data.loc[decoded.isna() & data[col].notna(), col].unique()
    if len(unmapped):
        raise ValueError(f"{col}: no label defined for code(s) {sorted(unmapped)}")
    df[col] = decoded

In [15]:
# Display df to inspect the categorical columns after the label mapping.
df

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,35 - 44,Female,Professional Certificate/ Diploma,UK,Mixed-White/Asian,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,25 - 34,Male,Doctorate Degree,UK,White,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,35 - 44,Male,Professional Certificate/ Diploma,UK,White,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,18 - 24,Female,Masters Degree,UK,White,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,35 - 44,Female,Doctorate Degree,UK,White,0.73545,-1.63340,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,18 - 24,Female,"Some College,No Certificate Or Degree",USA,White,-1.19430,1.74091,1.88511,0.76096,-1.13788,...,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1881,18 - 24,Male,"Some College,No Certificate Or Degree",USA,White,-0.24649,1.74091,0.58331,0.76096,-1.51840,...,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1882,25 - 34,Female,University Degree,USA,White,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,...,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1883,18 - 24,Female,"Some College,No Certificate Or Degree",USA,White,0.91093,-1.92173,0.29338,-1.62090,-2.57309,...,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


In [17]:
# Show the dtype of every column in df.
df.dtypes

Age           object
Gender        object
Education     object
Country       object
Ethnicity     object
Nscore       float64
Escore       float64
Oscore       float64
Ascore       float64
Cscore       float64
Impulsive    float64
SS           float64
Alcohol       object
Amphet        object
Amyl          object
Benzos        object
Caff          object
Cannabis      object
Choc          object
Coke          object
Crack         object
Ecstasy       object
Heroin        object
Ketamine      object
Legalh        object
LSD           object
Meth          object
Mushrooms     object
Nicotine      object
Semer         object
VSA           object
dtype: object

In [18]:
# Persist the decoded frame. index=False keeps the default RangeIndex out of
# the file; otherwise it is written as an unnamed first column and comes back
# as "Unnamed: 0" on read, re-adding an ID-like column after 'ID' was dropped.
df.to_csv("drug.csv", index=False)