In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox

In [6]:

file1 = pd.read_csv('Data/file1.csv')
file2 = pd.read_csv('Data/file2.csv')
file3 = pd.read_csv('Data/file3.csv')


# Standardizing header names

In [7]:
# to standarizing the header names of the columns, we need every single column to have same name

def rename_columns(file3):
    file3.rename(columns={'State':'ST','Gender':'GENDER'}, inplace=True )
    return file3

rename_columns(file3)

Unnamed: 0,Customer,ST,Customer Lifetime Value,Education,GENDER,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Total Claim Amount,Vehicle Class
0,SA25987,Washington,3479.137523,High School or Below,M,0,104,0,Personal Auto,499.200000,Two-Door Car
1,TB86706,Arizona,2502.637401,Master,M,0,66,0,Personal Auto,3.468912,Two-Door Car
2,ZL73902,Nevada,3265.156348,Bachelor,F,25820,82,0,Personal Auto,393.600000,Four-Door Car
3,KX23516,California,4455.843406,High School or Below,F,0,121,0,Personal Auto,699.615192,SUV
4,FN77294,California,7704.958480,High School or Below,M,30366,101,2,Personal Auto,484.800000,SUV
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,23405.987980,Bachelor,M,71941,73,0,Personal Auto,198.234764,Four-Door Car
7066,PK87824,California,3096.511217,College,F,21604,79,0,Corporate Auto,379.200000,Four-Door Car
7067,TD14365,California,8163.890428,Bachelor,M,0,85,3,Corporate Auto,790.784983,Four-Door Car
7068,UP19263,California,7524.442436,College,M,21941,96,0,Personal Auto,691.200000,Four-Door Car


In [8]:
# conactenation of the files in one file called auto_data
auto_data=pd.concat([file1,file2,file3], axis=0)
auto_data

Unnamed: 0,Customer,ST,GENDER,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [9]:
# here we convert all columns names in lower case.

def lower_case_auto_data(auto_data):
    auto_data.columns=[i.lower() for i in auto_data.columns]
    return auto_data

lower_case_auto_data(auto_data)

Unnamed: 0,customer,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


# Deleting and rearranging columns – delete the column customer as it is only a unique identifier for each row of data

In [10]:
# Let's delete the 'customer' column, we don't need it to analyze the data
auto_data.drop(columns=["customer"], inplace=True)
auto_data

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [11]:
#Rearrangin columns
#Here we see all colums in data frame.
list(auto_data.columns)

['st',
 'gender',
 'education',
 'customer lifetime value',
 'income',
 'monthly premium auto',
 'number of open complaints',
 'policy type',
 'vehicle class',
 'total claim amount']

In [12]:
#We exchanged the order of income and costumer lifetime value columms

auto_data=auto_data[['st',
 'gender',
 'education',
 'income',
 'customer lifetime value',
 'monthly premium auto',
 'number of open complaints',
 'policy type',
 'vehicle class',
 'total claim amount']]
auto_data

Unnamed: 0,st,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,0.0,,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,0.0,697953.59%,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,48767.0,1288743.17%,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,0.0,764586.18%,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,36357.0,536307.65%,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,71941.0,23405.98798,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,21604.0,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,0.0,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,21941.0,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


# Working with data types – Check the data types of all the columns and fix the incorrect ones (for ex. customer lifetime value and number of open complaints ). Hint: remove the percentage from the customer lifetime value and truncate it to an integer value.

In [13]:
auto_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12074 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9137 non-null   object 
 1   gender                     9015 non-null   object 
 2   education                  9137 non-null   object 
 3   income                     9137 non-null   float64
 4   customer lifetime value    9130 non-null   object 
 5   monthly premium auto       9137 non-null   float64
 6   number of open complaints  9137 non-null   object 
 7   policy type                9137 non-null   object 
 8   vehicle class              9137 non-null   object 
 9   total claim amount         9137 non-null   float64
dtypes: float64(3), object(7)
memory usage: 1.0+ MB


In [14]:
#We use Lambda function + isinstance in due to delete the % simbol in the Customer lifetime value column
auto_data["customer lifetime value"] = auto_data["customer lifetime value"].apply(lambda x: float (x.strip("%"))/100 if isinstance(x,str) else x)

auto_data

Unnamed: 0,st,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,0.0,,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,0.0,6979.535900,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,48767.0,12887.431700,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,0.0,7645.861800,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,36357.0,5363.076500,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,71941.0,23405.987980,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,21604.0,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,0.0,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,21941.0,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


In [15]:
auto_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12074 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9137 non-null   object 
 1   gender                     9015 non-null   object 
 2   education                  9137 non-null   object 
 3   income                     9137 non-null   float64
 4   customer lifetime value    9130 non-null   float64
 5   monthly premium auto       9137 non-null   float64
 6   number of open complaints  9137 non-null   object 
 7   policy type                9137 non-null   object 
 8   vehicle class              9137 non-null   object 
 9   total claim amount         9137 non-null   float64
dtypes: float64(4), object(6)
memory usage: 1.0+ MB


In [16]:
auto_data["number of open complaints"] = auto_data["number of open complaints"].apply(lambda x: x.split("/")[1] if isinstance(x,str) else x)
auto_data
                                                                        

Unnamed: 0,st,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,0.0,,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,0.0,6979.535900,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,48767.0,12887.431700,108.0,0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,0.0,7645.861800,106.0,0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,36357.0,5363.076500,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,71941.0,23405.987980,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,21604.0,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,0.0,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,21941.0,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


# Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns

In [17]:
#Filter the data in state. 
auto_data.st.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       'AZ', 'WA', nan], dtype=object)

In [18]:
#We use .replace()
auto_data["st"]= auto_data["st"].replace("AZ","Arizona") 
auto_data["st"] = auto_data["st"].replace("Cali","California")
auto_data["st"] = auto_data["st"].replace("WA","Washington")
auto_data.st.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', nan],
      dtype=object)

In [19]:
#Filter the data in gender, again: .replace()
#First, we need to know whitch variety of values in gender column
auto_data.gender.unique()

array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)

In [20]:
#We only want, F and M. Again, .replace()


auto_data["gender"]=auto_data["gender"].replace("Femal","F")
auto_data["gender"]=auto_data["gender"].replace("female","F")
auto_data["gender"]=auto_data["gender"].replace("Male","M")

auto_data.gender.unique()

array([nan, 'F', 'M'], dtype=object)

In [21]:
#we change NaN values for "U"(Unknown), we use:
auto_data['gender']=auto_data['gender'].fillna(value="U")

auto_data.gender.unique()

array(['U', 'F', 'M'], dtype=object)

In [22]:
auto_data['st']=auto_data['st'].fillna(value="U")

auto_data.st.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'U'],
      dtype=object)

In [23]:
#In customer lifetime value we can not use "U" because the values in this column are numbers. 
#so we replace the NaN for the mean of the column
#first I define a variable with the value of the mean of the customer lifetime value. I use this variable
#replace NaN of the column

ad1_mean=auto_data["customer lifetime value"].mean()

auto_data['customer lifetime value']=auto_data['customer lifetime value'].fillna(value=ad1_mean)

auto_data

#Applying directly the mean function instead the a variable with the mean value
#auto_data[‘customer lifetime value’]=auto_data[‘customer lifetime value’].fillna(auto_data[‘customer lifetime value’].mean())

Unnamed: 0,st,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,U,Master,0.0,7977.057704,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,0.0,6979.535900,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,48767.0,12887.431700,108.0,0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,0.0,7645.861800,106.0,0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,36357.0,5363.076500,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,71941.0,23405.987980,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,21604.0,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,0.0,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,21941.0,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


In [24]:
#in income column we can leave 0.0 values. As "0.0" is not a NaN, we use replace() instead fillna:
#In this case I apply the .mean() function instead a variable with the value of the mean
auto_data['income']=auto_data['income'].replace(0,auto_data["income"].mean())
auto_data["income"]

0       37828.820291
1       37828.820291
2       48767.000000
3       37828.820291
4       36357.000000
            ...     
7065    71941.000000
7066    21604.000000
7067    37828.820291
7068    21941.000000
7069    37828.820291
Name: income, Length: 12074, dtype: float64

# Removing duplicates

In [25]:
#For removing duplicates rows. we use this function

auto_data = auto_data.drop_duplicates()

auto_data

Unnamed: 0,st,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,U,Master,37828.820291,7977.057704,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,37828.820291,6979.535900,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,48767.000000,12887.431700,108.0,0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,37828.820291,7645.861800,106.0,0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,36357.000000,5363.076500,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,71941.000000,23405.987980,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,21604.000000,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,37828.820291,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,21941.000000,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


# Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [26]:
#California=West Region
#Oregon=North West
#Washington=East
#Arizona and Nevada=Central



In [27]:
auto_data["st"]= auto_data["st"].replace(["California","Oregon","Washington","Arizona"],["West Region","North West","East","Central"])
auto_data["st"]= auto_data["st"].replace("Nevada","Central")
auto_data.st.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_data["st"]= auto_data["st"].replace(["California","Oregon","Washington","Arizona"],["West Region","North West","East","Central"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_data["st"]= auto_data["st"].replace("Nevada","Central")


array(['East', 'Central', 'West Region', 'North West', 'U'], dtype=object)

In [28]:
# We use again .rename() to change the name of the 'st' colum for 'region'
auto_data.rename(columns={'st':'region'}, inplace=True )
auto_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_data.rename(columns={'st':'region'}, inplace=True )


Unnamed: 0,region,gender,education,income,customer lifetime value,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,East,U,Master,37828.820291,7977.057704,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Central,F,Bachelor,37828.820291,6979.535900,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Central,F,Bachelor,48767.000000,12887.431700,108.0,0,Personal Auto,Two-Door Car,566.472247
3,West Region,M,Bachelor,37828.820291,7645.861800,106.0,0,Corporate Auto,SUV,529.881344
4,East,M,High School or Below,36357.000000,5363.076500,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,West Region,M,Bachelor,71941.000000,23405.987980,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,West Region,F,College,21604.000000,3096.511217,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,West Region,M,Bachelor,37828.820291,8163.890428,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,West Region,M,College,21941.000000,7524.442436,96.0,0,Personal Auto,Four-Door Car,691.200000


# (Optional) Standardizing the data – Use string functions to standardize the text data (lower case)


In [29]:
auto_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8876 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   region                     8876 non-null   object 
 1   gender                     8876 non-null   object 
 2   education                  8875 non-null   object 
 3   income                     8875 non-null   float64
 4   customer lifetime value    8876 non-null   float64
 5   monthly premium auto       8875 non-null   float64
 6   number of open complaints  8875 non-null   object 
 7   policy type                8875 non-null   object 
 8   vehicle class              8875 non-null   object 
 9   total claim amount         8875 non-null   float64
dtypes: float64(4), object(6)
memory usage: 762.8+ KB


In [30]:
auto_data_obj =auto_data.select_dtypes(include = object)
auto_data_obj

Unnamed: 0,region,gender,education,number of open complaints,policy type,vehicle class
0,East,U,Master,0,Personal Auto,Four-Door Car
1,Central,F,Bachelor,0,Personal Auto,Four-Door Car
2,Central,F,Bachelor,0,Personal Auto,Two-Door Car
3,West Region,M,Bachelor,0,Corporate Auto,SUV
4,East,M,High School or Below,0,Personal Auto,Four-Door Car
...,...,...,...,...,...,...
7065,West Region,M,Bachelor,0,Personal Auto,Four-Door Car
7066,West Region,F,College,0,Corporate Auto,Four-Door Car
7067,West Region,M,Bachelor,3,Corporate Auto,Four-Door Car
7068,West Region,M,College,0,Personal Auto,Four-Door Car


In [54]:
#We use .str.lower() to convert the strings in the dataframe to a lower case.
auto_data_obj['region'] = auto_data['region'].str.lower()


In [44]:
auto_data_obj

Unnamed: 0,region,gender,education,number of open complaints,policy type,vehicle class
0,east,u,Master,0,Personal Auto,Four-Door Car
1,central,f,Bachelor,0,Personal Auto,Four-Door Car
2,central,f,Bachelor,0,Personal Auto,Two-Door Car
3,west region,m,Bachelor,0,Corporate Auto,SUV
4,east,m,High School or Below,0,Personal Auto,Four-Door Car
...,...,...,...,...,...,...
7065,west region,m,Bachelor,0,Personal Auto,Four-Door Car
7066,west region,f,College,0,Corporate Auto,Four-Door Car
7067,west region,m,Bachelor,3,Corporate Auto,Four-Door Car
7068,west region,m,College,0,Personal Auto,Four-Door Car


In [52]:
#as there are multiple columns we define a lambda function and convert the whole dataframe to lower case.
auto_data_obj=auto_data_obj.apply(lambda x: x.str.lower())
auto_data_obj

Unnamed: 0,region,gender,education,number of open complaints,policy type,vehicle class
0,east,u,master,0,personal auto,four-door car
1,central,f,bachelor,0,personal auto,four-door car
2,central,f,bachelor,0,personal auto,two-door car
3,west region,m,bachelor,0,corporate auto,suv
4,east,m,high school or below,0,personal auto,four-door car
...,...,...,...,...,...,...
7065,west region,m,bachelor,,personal auto,four-door car
7066,west region,f,college,,corporate auto,four-door car
7067,west region,m,bachelor,,corporate auto,four-door car
7068,west region,m,college,,personal auto,four-door car
