In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Upstream location of the data set (kept for reference; the read below uses a local file).
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
data = pd.read_csv('../covtype.data', header=None)

# The file is read without a header row, so the column names are built here:
# ten named features, Wilderness_Area_1..4, Soil_Type_1..40, then the target.
wilderness_areas = ['Wilderness_Area_%d' % i for i in range(1, 5)]
soil_types = ['Soil_Type_%d' % i for i in range(1, 41)]
columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
] + wilderness_areas + soil_types + ['Cover_Type']

# Assigning the names raises if their count does not match the file's column count.
data.columns = columns
data

In [None]:
# Show the observations whose Cover_Type label is 2.
data[data['Cover_Type'] == 2]

In [None]:
# Show the observations whose Cover_Type label is 5.
data[data['Cover_Type'] == 5]

In [None]:
# Named features plus the target label; the Wilderness_Area_* and Soil_Type_*
# columns are left out of this correlation view.
cols = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points', 'Cover_Type']
data_subset = data[cols]
# np.corrcoef treats each ROW as one variable, hence the transpose.
correlation_matrix = np.corrcoef(data_subset.values.T)

# Apply the seaborn theme before the figure exists. Setting it afterwards makes
# the first run of this cell and a re-run render with different settings.
sns.set(font_scale=1.1)
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    data=correlation_matrix,
    square=True,
    cbar=True,
    annot=True,
    annot_kws={'size': 10},
    xticklabels=cols,
    yticklabels=cols,
    fmt=".2f",
    linewidths=.5,  # documented seaborn parameter for the cell separator width
    cmap=sns.cubehelix_palette(as_cmap=True),
    ax=ax,  # draw on the Axes created above rather than the implicit current one
)
ax.set_title("Correlation matrix of the selected features and Cover_Type")
# plt.show() renders the figure and keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# Observed maximum and minimum of five selected features.
max_val_E, min_val_E = data['Elevation'].max(), data['Elevation'].min()
max_val_A, min_val_A = data['Aspect'].max(), data['Aspect'].min()
max_val_HH, min_val_HH = (
    data['Horizontal_Distance_To_Hydrology'].max(),
    data['Horizontal_Distance_To_Hydrology'].min(),
)
max_val_HR, min_val_HR = (
    data['Horizontal_Distance_To_Roadways'].max(),
    data['Horizontal_Distance_To_Roadways'].min(),
)
max_val_HFP, min_val_HFP = (
    data['Horizontal_Distance_To_Fire_Points'].max(),
    data['Horizontal_Distance_To_Fire_Points'].min(),
)

# Report the ranges; the numeric suffix in each message matches the feature's
# zero-based position among the data columns.
print(f"Maximum value of 'Elevation' - row0: {max_val_E}")
print(f"Minimum value of 'Elevation' - row0: {min_val_E}")
print(f"Maximum value of 'Aspect' - row1: {max_val_A}")
print(f"Minimum value of 'Aspect' - row1: {min_val_A}")
print(f"Maximum value of 'Horizontal_Distance_To_Hydrology' - row3: {max_val_HH}")
print(f"Minimum value of 'Horizontal_Distance_To_Hydrology' - row3: {min_val_HH}")
print(f"Maximum value of 'Horizontal_Distance_To_Roadways' - row5: {max_val_HR}")
print(f"Minimum value of 'Horizontal_Distance_To_Roadways' - row5: {min_val_HR}")
print(f"Maximum value of 'Horizontal_Distance_To_Fire_Points' - row9: {max_val_HFP}")
print(f"Minimum value of 'Horizontal_Distance_To_Fire_Points' - row9: {min_val_HFP}")

In [None]:
def make_hist(col_name, bins_val, df=None):
    """Print the value range of one column and draw its histogram.

    Parameters
    ----------
    col_name : str
        Name of the column to summarise and plot.
    bins_val
        Bin specification forwarded to ``Series.plot.hist(bins=...)``.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data`` frame is used, so existing two-argument calls keep working.

    Returns
    -------
    None
        The histogram is rendered with ``plt.show()``.
    """
    frame = data if df is None else df
    values = frame[col_name]

    # Local names deliberately avoid shadowing the built-in min() / max().
    lowest = values.min()
    highest = values.max()
    print(f"Lowest {col_name} value: {lowest}. Highest {col_name} value: {highest}.")

    # Use a dedicated Axes so the histogram never lands on a leftover figure.
    _, ax = plt.subplots()
    values.plot.hist(bins=bins_val, grid=True, ax=ax)
    # The earlier title talked about car production, which does not describe
    # this data; state what is actually plotted instead.
    ax.set_title(f"Distribution of {col_name}")
    ax.set_xlabel(f"{col_name}")
    ax.set_ylabel("Number of observations")
    plt.show()

In [None]:
# Histogram of Elevation with 57 bins.
make_hist(col_name="Elevation", bins_val=57)

In [None]:
# Histogram of Aspect with 57 bins.
make_hist(col_name="Aspect", bins_val=57)

In [None]:
# Histogram of Slope with 57 bins.
make_hist(col_name="Slope", bins_val=57)

In [None]:
# Histogram of Horizontal_Distance_To_Hydrology with 57 bins.
make_hist(col_name="Horizontal_Distance_To_Hydrology", bins_val=57)

In [None]:
# Histogram of Horizontal_Distance_To_Roadways with 57 bins.
make_hist(col_name="Horizontal_Distance_To_Roadways", bins_val=57)

In [None]:
# Histogram of Horizontal_Distance_To_Fire_Points with 57 bins.
make_hist(col_name="Horizontal_Distance_To_Fire_Points", bins_val=57)

In [None]:
# Separate the target label from the remaining columns.
y = data['Cover_Type']
X = data.drop(columns=['Cover_Type'])

In [None]:
# Arithmetic mean of seven selected features.
mean_E = data['Elevation'].mean()
mean_A = data['Aspect'].mean()
mean_HH = data['Horizontal_Distance_To_Hydrology'].mean()
mean_HR = data['Horizontal_Distance_To_Roadways'].mean()
mean_S = data['Slope'].mean()
mean_VDH = data['Vertical_Distance_To_Hydrology'].mean()
mean_HFP = data['Horizontal_Distance_To_Fire_Points'].mean()

# Print one mean per line, in the same order as they were computed.
for feature_mean in (mean_E, mean_A, mean_HH, mean_HR, mean_S, mean_VDH, mean_HFP):
    print(feature_mean)

In [None]:
def my_heuristic_algorithm(row):
    """Assign a cover type (1-7) to one observation using hand-written rules.

    The rules are checked from top to bottom and the first match wins, so an
    earlier rule takes precedence wherever two conditions overlap.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name, e.g. a ``pandas.Series`` row or a
        ``dict``. Values are looked up lazily, so only the keys needed by the
        rules that are actually evaluated have to be present.

    Returns
    -------
    int
        The predicted cover type; 7 is the fallback when no rule matches.
    """
    if row['Elevation'] > 3000 and row['Slope'] < 20:
        return 1
    if row['Elevation'] < 3000 and row['Slope'] < 15:
        return 2
    if row['Elevation'] < 2500 and row['Slope'] > 10:
        return 3
    # The bounds used to read ``180 < Aspect < 150``, which no value can
    # satisfy, so cover type 4 could never be predicted. The range is now
    # written in ascending order.
    if (150 < row['Aspect'] < 180
            and row['Hillshade_Noon'] > 200
            and row['Horizontal_Distance_To_Roadways'] > 2000):
        return 4
    if (270 < row['Aspect'] < 300
            and row['Hillshade_3pm'] > 150
            and row['Horizontal_Distance_To_Roadways'] < 2000):
        return 5
    if row['Wilderness_Area_1'] == 0 and row['Soil_Type_10'] == 1:
        return 6
    # No rule matched.
    return 7
# Run the heuristic on every row and store the result next to the true label.
data['predicted_cover_type'] = data.apply(my_heuristic_algorithm, axis='columns')

# Alternative kept for reference (disabled):
# # Use the heuristic algorithm to make predictions
# y_pred = X.apply(my_heuristic_algorithm, axis=1)
# #
# # Display the results
# print(y_pred.value_counts())

# Percentage of rows whose prediction equals the true Cover_Type.
accuracy = data['predicted_cover_type'].eq(data['Cover_Type']).mean() * 100
print(f'Accuracy: {accuracy:.2f}%')

In [None]:
# Hand-built observation used to spot-check the heuristic on a single row.
sample_row = pd.Series(dict(
    Elevation=2800,
    Slope=12,
    Aspect=220,
    Hillshade_Noon=220,
    Wilderness_Area_3=1,
    Horizontal_Distance_To_Hydrology=120,
    Hillshade_9am=160,
))

# Classify the sample and show the resulting cover type.
predicted_cover_type = my_heuristic_algorithm(sample_row)
print(predicted_cover_type)