In [150]:
import numpy as np
import pandas as pd

def load_income_data():
    return pd.read_csv(r"C:\Users\Bharat\Music\ML\label.csv")

In [151]:
income = load_income_data()
income["Income in EUR"] = abs(income["Income in EUR"])

In [161]:
income.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm]
97546,97547,1998.0,female,15.0,Belarus,790343,ombudsperson,No,0,Brown,168
27380,27381,2001.0,male,55.0,Costa Rica,14906,mediation coordinator,Master,1,Brown,200
96608,96609,1983.0,other,28.0,Qatar,1331438,reporter,Bachelor,1,Blond,209
86131,86132,1997.0,other,24.0,State of Palestine,1098552,recruitment assistant,Bachelor,0,Blond,176
21611,21612,1996.0,male,27.0,Sweden,520247,legal assistant,Bachelor,1,Blond,208


In [153]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(income, test_size=0.2, random_state=42)

In [154]:
income = train_set.drop("Income in EUR", axis=1) # drop labels for training set
income_labels = train_set["Income in EUR"].copy()

In [155]:
sample_incomplete_rows = income[income.isnull().any(axis=1)].head()

In [156]:
median = income["Year of Record"].median()
sample_incomplete_rows["Year of Record"].fillna(median, inplace=True) # option 3

In [157]:
income["Profession"].fillna("Other", inplace = True) 
income["Gender"].fillna("Other", inplace = True) 
income["University Degree"].fillna("Other", inplace = True) 

In [158]:
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy="median")

In [162]:
income_num = income.drop(['Instance', 'Gender','Profession','Country', 'University Degree', 'Hair Color', 'Wears Glasses'], axis=1)
income_num[income_num.isnull().any(axis=1)].fillna(0, inplace=True) # option 3
income_num['Size of City'] = np.log(income_num['Size of City'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [163]:
imputer.fit(income_num)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [164]:
X = imputer.transform(income_num)

In [165]:
income_tr = pd.DataFrame(X, columns=income_num.columns,
                          index=income.index)

In [166]:
income_tr.loc[sample_incomplete_rows.index.values]

Unnamed: 0,Year of Record,Age,Size of City,Body Height [cm]
24233,1986.0,71.0,13.794708,188.0
77954,2004.0,31.0,14.676074,155.0
15551,1999.0,26.0,14.162721,165.0
60235,2013.0,17.0,12.704967,181.0
66420,1989.0,28.0,13.880409,155.0


In [167]:
income_cat = income[['Profession', 'University Degree', 'Gender']]

In [168]:
try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20

In [169]:
ordinal_encoder = OrdinalEncoder()
income_cat_encoded = ordinal_encoder.fit_transform(income_cat)

In [170]:
ordinal_encoder.categories_

[array(['.net developer', '.net software developer', 'Accountant', ...,
        'x-ray technician', 'yardmaster', 'youth initiatives lead advisor'],
       dtype=object),
 array(['0', 'Bachelor', 'Master', 'No', 'Other', 'PhD'], dtype=object),
 array(['0', 'Other', 'female', 'male', 'other', 'unknown'], dtype=object)]

In [171]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

cat_encoder = OneHotEncoder()
income_cat_1hot = cat_encoder.fit_transform(income_cat)
income_cat_1hot

<89594x1343 sparse matrix of type '<class 'numpy.float64'>'
	with 268782 stored elements in Compressed Sparse Row format>

In [172]:
income_cat_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [173]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

income_num_tr = num_pipeline.fit_transform(income_num)

In [174]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [175]:
num_attribs = list(income_num)
cat_attribs = ["Profession","University Degree", "Gender"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

income_prepared = full_pipeline.fit_transform(income)

In [176]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [177]:
num_attribs = list(income_num)
cat_attribs = ["Profession", "University Degree", "Gender"]

old_num_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

old_cat_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

In [178]:
from sklearn.pipeline import FeatureUnion

old_full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", old_num_pipeline),
        ("cat_pipeline", old_cat_pipeline),
    ])

In [179]:
old_income_prepared = old_full_pipeline.fit_transform(income)
old_income_prepared

array([[-0.12380589, -1.393303  , -0.01935924, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.13597626,  1.10261245, -0.38345652, ...,  1.        ,
         0.        ,  0.        ],
       [-1.42271659, -0.58213048,  0.23470552, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.03721184,  1.47699977,  0.07848171, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.34829292,  0.79062302,  0.51029579, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17510482,  0.72822513,  0.01908373, ...,  0.        ,
         0.        ,  0.        ]])

In [180]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(income_prepared, np.absolute(income_labels))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [181]:
from sklearn.metrics import mean_squared_error

income_predictions = lin_reg.predict(old_income_prepared)
lin_mse = mean_squared_error(income_labels, income_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

141773.6076868273

In [182]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(income_labels, income_predictions)
lin_mae

85966.61895269905

In [183]:
income1 =pd.read_csv(r"C:\Users\Bharat\Music\ML\Nolabel.csv")

income1["Profession"].fillna("Other", inplace = True) 
income1["Gender"].fillna("Other", inplace = True) 
income1["University Degree"].fillna("Other", inplace = True) 

income1[income1.isnull().any(axis=1)].fillna(0, inplace=True) # option 3

X_test_prepared = full_pipeline.transform(income1)

income_predictions1 = lin_reg.predict(X_test_prepared)
df = pd.DataFrame(income_predictions1, columns=['Income'])
df.to_csv(r"C:\Users\Bharat\Music\ML\predictions.csv")

ValueError: Found unknown categories ['administrative manager', 'coach', 'certified it administrator', 'auditor', 'aerospace engineer', 'application solution manager', 'asset manager', 'cashier', 'assistant business services associate', 'astronomer', 'apparel patternmaker', 'brokerage clerk', 'accountant', 'clinical case supervisor', 'apprentice inspector', 'asset management specialist', 'baggage porter', 'account executive ', 'community supervisor', 'back end developer'] in column 0 during transform

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10], 'max_features': [2, 4]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(income_prepared, income_labels)

In [None]:
grid_search.best_params_ 

In [None]:
""""import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import hashlib
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
from sklearn import metrics
import pandas as pd

dataset = pd.read_csv(r"C:\Users\Bharat\Music\ML\label.csv",)
median1 = dataset["Year of Record"].median()
dataset["Year of Record"].fillna(median1, inplace=True)
X = dataset['Year of Record'].values.reshape(-1,1)
y = dataset['Income in EUR'].values.reshape(-1,1)

#new_dataset = pd.read_csv(r"C:\Users\Bharat\Music\ML\Nolabel.csv",)
#median1 = new_dataset["Year of Record"].median()
#new_dataset["Year of Record"].fillna(median1, inplace=True)
#X1 = new_dataset['Year of Record'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

regressor = LinearRegression()  
regressor.fit(X_train, y_train)
print(regressor.intercept_)
print(regressor.coef_)
y_pred = regressor.predict(X_test)
#df = pd.DataFrame(y_pred, columns=['Income'])
#df.to_csv(r"C:\Users\Bharat\Music\ML\Nolabel.csv")

print(y_pred)"""