In \[72\]:

    import pygad
    import numpy
    from sklearn.datasets import fetch_california_housing
    from sklearn.linear_model import LinearRegression
    import pandas as pd
    import pygad
    import numpy as np

.. \_california_housing_dataset:

## California Housing dataset<a href="#California-Housing-dataset" class="anchor-link">¶</a>

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
<https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html>

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars (\$100,000).

This dataset was derived from the 1990 U.S. census, using one row per
census block group. A block group is the smallest geographical unit for
which the U.S. Census Bureau publishes sample data (a block group
typically has a population of 600 to 3,000 people).

A household is a group of people residing within a home. Since the
average numbers of rooms and bedrooms in this dataset are provided per
household, these columns may take surprisingly large values for block
groups with few households and many empty houses, such as vacation
resorts.

It can be downloaded/loaded using the
`sklearn.datasets.fetch_california_housing` function.

**References**

- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
  Statistics and Probability Letters, 33 (1997) 291-297

In \[73\]:

    # Fetch the California Housing dataset (features, target and feature names).
    data = fetch_california_housing()

    # Feature matrix as a plain NumPy array, one column per entry of data.feature_names.
    X = pd.DataFrame(data=data.data, columns=data.feature_names).to_numpy()

    # Target as a single-column 2-D array; the regression fitted on it therefore
    # returns 2-D predictions, which fitness_func indexes with [0][0].
    y = pd.DataFrame(data=data.target).to_numpy()

In \[74\]:

    # Linear regression of the target on all eight features. The fitted `model`
    # is what the genetic algorithm below queries to score each candidate.
    model = LinearRegression()
    model.fit(X=X, y=y)

Out\[74\]:

    LinearRegression()

In \[76\]:

    # Disabled experiment inputs, kept for reference only.
    # NOTE: while these lines stay commented out, `function_inputs` and
    # `desired_output` are not defined on a fresh kernel, so any cell that
    # reads them raises NameError.
    #function_inputs = [1,1,1,1,1,1,1,1]
    #desired_output = 5

In \[84\]:

    def fitness_func(solution, solution_idx):
        """Score a candidate by the value the fitted regression predicts for it.

        Parameters
        ----------
        solution : sequence of 8 numbers
            Gene vector, interpreted as one row of features in the same column
            order the notebook-level ``model`` was fitted on.
        solution_idx : int
            Position of the candidate in the population. Accepted because the
            GA passes it, but not used here.

        Returns
        -------
        The first element of the model's 2-D prediction for that row. A larger
        prediction means a larger fitness.
        """
        feature_row = np.reshape(solution, (-1, 8))
        predicted = model.predict(feature_row)
        return predicted[0][0]

In \[112\]:

    # Callback the GA uses to score every candidate solution.
    fitness_function = fitness_func

    # Search budget: number of generations, and parents selected for mating in each.
    num_generations, num_parents_mating = 1000, 40

    # Population layout: candidates per generation, and genes per candidate
    # (8 genes, matching the 8-column row that fitness_func builds).
    sol_per_pop, num_genes = 100, 8

    # Range from which the initial gene values are drawn.
    init_range_low, init_range_high = 0, 10

    # "sss" is PyGAD's steady-state selection; keep_parents is the number of
    # parents carried over into the next population.
    parent_selection_type, keep_parents = "sss", 1

    # Variation operators: single-point crossover, then random mutation applied
    # to 20 percent of the genes.
    crossover_type = "single_point"
    mutation_type, mutation_percent_genes = "random", 20

In \[113\]:

    # Assemble the genetic algorithm from the settings defined in the configuration cell.
    ga_instance = pygad.GA(
        # objective
        fitness_func=fitness_function,
        # search budget and population layout
        num_generations=num_generations,
        sol_per_pop=sol_per_pop,
        num_genes=num_genes,
        init_range_low=init_range_low,
        init_range_high=init_range_high,
        # parent selection
        parent_selection_type=parent_selection_type,
        num_parents_mating=num_parents_mating,
        keep_parents=keep_parents,
        # variation operators
        crossover_type=crossover_type,
        mutation_type=mutation_type,
        mutation_percent_genes=mutation_percent_genes,
    )

In \[114\]:

    # Execute the configured genetic algorithm; the outcome is read back from
    # ga_instance afterwards via best_solution().
    ga_instance.run()

In \[115\]:

    # Best candidate found by the GA, its fitness, and its index in the population.
    solution, solution_fitness, solution_idx = ga_instance.best_solution()
    print(f"Parameters of the best solution : {solution}")
    print(f"Fitness value of the best solution = {solution_fitness}")

    # Score the best candidate with the fitted regression itself. The previous
    # weighted-sum formula read `function_inputs`, which exists only as
    # commented-out code, so it raised NameError on a fresh kernel, and it did
    # not describe the regression being optimised in the first place.
    # The (-1, 8) row shape mirrors what fitness_func feeds to the model.
    prediction = model.predict(numpy.reshape(solution, (-1, 8)))[0][0]
    print(f"Predicted output based on the best solution : {prediction}")

    Parameters of the best solution : [ 252.77072968    3.57908148  -45.80824313  293.63544812    4.77642485
       -2.58326315 -205.88454344 -228.77828209]
    Fitness value of the best solution = 453.9647129571303
    Predicted output based on the best solution : 71.70735232688185

In \[116\]:

    # Re-score the best solution with the regression directly. This should equal
    # the best fitness, because fitness_func returns this same prediction.
    model.predict(solution.reshape(-1,8))

Out\[116\]:

    array([[453.96471296]])

In \[117\]:

    # Display the gene vector returned by ga_instance.best_solution().
    solution

Out\[117\]:

    array([ 252.77072968,    3.57908148,  -45.80824313,  293.63544812,
              4.77642485,   -2.58326315, -205.88454344, -228.77828209])

In \[106\]:

    # First row of the real feature matrix, presumably shown for comparison with
    # the evolved solution above.
    X[0]

Out\[106\]:

    array([   8.3252    ,   41.        ,    6.98412698,    1.02380952,
            322.        ,    2.55555556,   37.88      , -122.23      ])

In \[91\]:

    # Load the dataset again and keep the features as a labelled DataFrame `x`
    # (lower-case, distinct from the NumPy array `X`) for inspection below.
    data = fetch_california_housing()
    x = pd.DataFrame(data=data.data, columns=data.feature_names)
    # The target is intentionally not rebuilt here:
    #y = pd.DataFrame(data.target)

In \[92\]:

    # Column names, non-null counts and dtypes of the feature DataFrame.
    x.info()

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 20640 entries, 0 to 20639
    Data columns (total 8 columns):
     #   Column      Non-Null Count  Dtype  
    ---  ------      --------------  -----  
     0   MedInc      20640 non-null  float64
     1   HouseAge    20640 non-null  float64
     2   AveRooms    20640 non-null  float64
     3   AveBedrms   20640 non-null  float64
     4   Population  20640 non-null  float64
     5   AveOccup    20640 non-null  float64
     6   Latitude    20640 non-null  float64
     7   Longitude   20640 non-null  float64
    dtypes: float64(8)
    memory usage: 1.3 MB

In \[93\]:

    # Per-feature summary statistics (count, mean, std, min, quartiles, max).
    x.describe()

Out\[93\]:

|       | MedInc       | HouseAge     | AveRooms     | AveBedrms    | Population   | AveOccup     | Latitude     | Longitude    |
|-------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean  | 3.870671     | 28.639486    | 5.429000     | 1.096675     | 1425.476744  | 3.070655     | 35.631861    | -119.569704  |
| std   | 1.899822     | 12.585558    | 2.474173     | 0.473911     | 1132.462122  | 10.386050    | 2.135952     | 2.003532     |
| min   | 0.499900     | 1.000000     | 0.846154     | 0.333333     | 3.000000     | 0.692308     | 32.540000    | -124.350000  |
| 25%   | 2.563400     | 18.000000    | 4.440716     | 1.006079     | 787.000000   | 2.429741     | 33.930000    | -121.800000  |
| 50%   | 3.534800     | 29.000000    | 5.229129     | 1.048780     | 1166.000000  | 2.818116     | 34.260000    | -118.490000  |
| 75%   | 4.743250     | 37.000000    | 6.052381     | 1.099526     | 1725.000000  | 3.282261     | 37.710000    | -118.010000  |
| max   | 15.000100    | 52.000000    | 141.909091   | 34.066667    | 35682.000000 | 1243.333333  | 41.950000    | -114.310000  |

In \[ \]: