In [10]:
import numpy as np
import matplotlib.pyplot as plt
import random 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
import sys
from typing import Sequence
sys.path.append("..")
from geneticalg.core.AbstractSolver import AbstractSolver

In [2]:
df1 = pd.read_csv("communities.data" , header=None)
df1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [3]:
def read_header(filename):
    '''
    Given a filename containing headers, extract the headers and assign it to df
    '''

    header_list = []
    with open(filename) as f:
        for line in f:
            if "@attribute" in line:
                header_list.append(line.split(" ")[1])
    return header_list
    
headers = read_header("communities.names")
for i in headers:
    print(i)
df1.columns = headers

state
county
community
communityname
fold
population
householdsize
racepctblack
racePctWhite
racePctAsian
racePctHisp
agePct12t21
agePct12t29
agePct16t24
agePct65up
numbUrban
pctUrban
medIncome
pctWWage
pctWFarmSelf
pctWInvInc
pctWSocSec
pctWPubAsst
pctWRetire
medFamInc
perCapInc
whitePerCap
blackPerCap
indianPerCap
AsianPerCap
OtherPerCap
HispPerCap
NumUnderPov
PctPopUnderPov
PctLess9thGrade
PctNotHSGrad
PctBSorMore
PctUnemployed
PctEmploy
PctEmplManu
PctEmplProfServ
PctOccupManu
PctOccupMgmtProf
MalePctDivorce
MalePctNevMarr
FemalePctDiv
TotalPctDiv
PersPerFam
PctFam2Par
PctKids2Par
PctYoungKids2Par
PctTeen2Par
PctWorkMomYoungKids
PctWorkMom
NumIlleg
PctIlleg
NumImmig
PctImmigRecent
PctImmigRec5
PctImmigRec8
PctImmigRec10
PctRecentImmig
PctRecImmig5
PctRecImmig8
PctRecImmig10
PctSpeakEnglOnly
PctNotSpeakEnglWell
PctLargHouseFam
PctLargHouseOccup
PersPerOccupHous
PersPerOwnOccHous
PersPerRentOccHous
PctPersOwnOccup
PctPersDenseHous
PctHousLess3BR
MedNumBR
HousVacant
PctHousOccup
PctHo

In [4]:
def drop_columns(columns , df):
    '''
    Given dataframe , returns updated df with removed colums
    '''
    for i in columns:
        df = df.drop(i , axis=1)

    return df

drop_list = ['state' , 'county' , 'community' , 'communityname' , 'fold' ]
df1 = drop_columns(drop_list , df1)


In [5]:
def find_missing_values(df):
    '''
    Gets the missing value indices
    '''
    missing_values = []
    missing_values_each_row = []
    for i in range(df.shape[0]):
        if "?" in list(df.iloc[i].values):
            missing_values.append(i)

    for column in df.columns:
        count = df[df[column] == "?"].shape[0]
        missing_values_each_row.append((column , count))
    

    for column,count in missing_values_each_row:
        if count == 1675:
            df = df.drop(column , axis=1)
            
    df = df[df["OtherPerCap"] != "?"]

    return df

df1 = find_missing_values(df1)

In [6]:
x = df1.drop("ViolentCrimesPerPop", axis=1)
y = df1["ViolentCrimesPerPop"]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=24)
print(x_train.shape , y_train.shape)
print(x_test.shape , y_test.shape)

(1594, 100) (1594,)
(399, 100) (399,)


In [58]:
dist = []

In [59]:
def fitness_func(chromosome):
    columns = []
    for i in range(len(x_train.columns)):
        if i in chromosome:
            columns.append(x_train.columns[i])
    dist.append(columns)
    training_set = x_train[columns]
    test_set = x_test[columns]
    lg = LinearRegression().fit(training_set.values, y_train.values)
    preds = lg.predict(test_set.values)
    return 100 / np.sqrt(mean_squared_error(y_test.values, preds))
    



In [84]:
class FeatureSolver(AbstractSolver):
    def __init__(
        self,
        problem_type=float,
        fitness_func= lambda a : fitness_func(a),
        pop_cnt: int = 100,
        gene_size: int = 50,
        max_gen: int = 100,
        mutation_ratio: float = 0.2,
        selection_ratio: float = 0.2,
        selection_type: str = "roulette_wheel",
        mutation_type: str = "insert",
        crossover_type: str = "one_point",
        excluded_genes: Sequence = None,
        variables_limits=(-10, 10),
        verbose: bool = False,
        cv=0,
        **kwargs
    ):
        """
        :param fitness_function: can either be a fitness function or
        a class implementing a fitness function + methods to override
        the default ones: create_offspring, mutate_population, initialize_population
        :param n_genes: number of genes (variables) to have in each chromosome
        :param max_gen: maximum number of generations to perform the optimization
        :param pop_size: population size
        :param mutation_rate: rate at which random mutations occur
        :param selection_rate: percentage of the population to be selected for crossover
        :param selection_strategy: strategy to use for selection
        :param verbose: whether to print iterations status
        :param show_stats: whether to print stats at the end
        :param plot_results: whether to plot results of the run at the end
        :param variables_limits: limits for each variable [(x1_min, x1_max), (x2_min, x2_max), ...].
        If only one tuple is provided, then it is assumed the same for every variable
        :param problem_type: whether problem is of float or integer type
        """

        AbstractSolver.__init__(
            self,
            problem_type=problem_type,
            gene_size=gene_size,
            fitness_func=fitness_func,
            pop_cnt=pop_cnt,
            max_gen=max_gen,
            mutation_ratio=mutation_ratio,
            selection_ratio=selection_ratio,
            selection_type=selection_type,
            mutation_type=mutation_type,
            crossover_type=crossover_type,
            excluded_genes=excluded_genes,
            verbose=verbose,
            cv=cv,
            **kwargs
        )
        
    def initialize_population(self):
        """
        Initializes the population of the problem according to the
        population size and number of genes and according to the problem
        type (either integers or floats).

        :return: a numpy array with a randomized initialized population
        """
        population = np.empty(shape=(self.pop_cnt, self.gene_size))
        for i in range(0,self.pop_cnt):
            population[i] = random.sample(range(0,99) , self.gene_size)


        return population

random.seed(10)        
solver = FeatureSolver(
        gene_size=50,
        cv=0 
    )

list(solver.solve())

Iter number: 1
Best fitness: 0.0013614502153978649
best individual: [81. 73. 90. 84. 47. 21. 67. 35. 40. 24. 64. 69. 26. 55. 25. 68. 57. 59.
 74. 87. 38. 82.  0. 44. 76. 33. 92. 32. 13. 52. 79. 78. 85.  9. 72. 20.
 66. 49. 22. 65. 62.  7.  4. 89. 96. 17. 11. 15.  2. 93.]
Iter number: 2
Best fitness: 0.0013614502153978649
best individual: [81. 73. 90. 84. 47. 21. 67. 35. 40. 24. 64. 69. 26. 55. 25. 68. 57. 59.
 74. 87. 38. 82.  0. 44. 76. 33. 92. 32. 13. 52. 79. 78. 85.  9. 72. 20.
 66. 49. 22. 65. 62.  7.  4. 89. 96. 17. 11. 15.  2. 93.]
Iter number: 3
Best fitness: 0.0013614502153978649
best individual: [81. 73. 90. 84. 47. 21. 67. 35. 40. 24. 64. 69. 26. 55. 25. 68. 57. 59.
 74. 87. 38. 82.  0. 44. 76. 33. 92. 32. 13. 52. 79. 78. 85.  9. 72. 20.
 66. 49. 22. 65. 62.  7.  4. 89. 96. 17. 11. 15.  2. 93.]
Iter number: 4
Best fitness: 0.0013614502153978649
best individual: [81. 73. 90. 84. 47. 21. 67. 35. 40. 24. 64. 69. 26. 55. 25. 68. 57. 59.
 74. 87. 38. 82.  0. 44. 76. 33. 92. 32. 13

KeyboardInterrupt: 