## Knjižnice in spremenljivke


In [136]:
import pandas as pd
import folium as f
from datetime import datetime
import collections
import matplotlib.pyplot as plt
import plotly.express as px
from dateutil.relativedelta import relativedelta
import plotly.offline as pyo

%store -r comm_data
%store -r cars
%store -r population
pyo.init_notebook_mode(connected=True)

cars['B-Datum prve registracije vozila'] = pd.to_datetime(cars['B-Datum prve registracije vozila'], format='%d.%m.%Y')
cars['P13-Vrsta goriva (opis)'].replace('Ni goriva', 'Elektrika')

0          Dizel
1         Bencin
2         Bencin
3         Bencin
5          Dizel
           ...  
475864    Bencin
475866    Bencin
475867     Dizel
475871     Dizel
475872     Dizel
Name: P13-Vrsta goriva (opis), Length: 346465, dtype: object

### Porazdelitev registriranih avtov po letih


In [137]:
import plotly.express as px

years = list([x.year for x in cars['B-Datum prve registracije vozila']])
years = filter(lambda x: x >= 1975, years)
years = dict(sorted(collections.Counter(years).items(), reverse=True))

fig = px.bar(x=list(years.keys()), y=list(years.values()))
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of registered cars',
    title='Number of registered cars by year',
    xaxis=dict(range=[1975, 2023])
)
fig.update_layout(dict( height=800))

pyo.iplot(fig)


In [138]:
import plotly.graph_objects as go

cumulative_counts = []
cumulative_count = 0
for year, count in list(years.items())[::-1]:
    cumulative_count += count
    cumulative_counts.append(cumulative_count)

fig = go.Figure(data=[go.Scatter(x=list(years.keys())[::-1], y=cumulative_counts, mode='lines', stackgroup='one')])
fig.update_layout(
    title='',
)
fig.update_layout(dict( height=800))
pyo.iplot(fig)


### Najpopularnejša znamka


In [144]:
car_brands = {
    "VOLKSWAGEN": "Germany",
    "OPEL": "Germany",
    "FORD": "United States",
    "BMW": "Germany",
    "MERCEDES BENZ": "Germany",
    "AUDI": "Germany",
    "RENAULT": "France",
    "CITROEN": "France",
    "PEUGEOT": "France",
    "ŠKODA": "Czech Republic",
    "FIAT": "Italy",
    "KIA": "South Korea",
    "HYUNDAI": "South Korea",
    "SEAT": "Spain",
    "VOLVO": "Sweden",
    "DACIA": "Romania",
    "NISSAN": "Japan",
    "MAZDA": "Japan",
    "TOYOTA": "Japan",
    "SUZUKI": "Japan",
    "HONDA": "Japan",
}


In [194]:
brand = collections.Counter(cars['D1-Znamka'])
# Convert the dictionary to a DataFrame
df = pd.DataFrame({'brand': list(brand.keys()), 'value': list(brand.values())})

# Compute the total value of all brands
total_value = df['value'].sum()
# Compute the percentage of each brand
df['country'] = df['brand'].map(car_brands).fillna("Preostale države")
sum_other = df.loc[df['country'] == 'Preostale države','value'].sum()
df = df[df['country'] != 'Preostale države'].copy()

new_row = [{'brand': 'Preostale znamke', 'value': sum_other,'country':'Drugo'}]
df = pd.concat([df, pd.DataFrame(new_row)], ignore_index=True)
df['percent'] = (df['value'] / total_value) * 100


# Create the hover text to display in each rectangle
df['text'] = df['brand'] + '<br>' + df['percent'].round(2).astype(str) + '%'
# Convert the RGB colors to Plotly color strings
color_strings = ['rgb({},{},{})'.format(int(c[0]*255), int(c[1]*255), int(c[2]*255)) for c in colors]
# Create the treemap using Plotly
fig = px.treemap(df, path=[ 'country','text'], values='value')

fig.update_layout(dict( height=800))
pyo.iplot(fig)

### Delež goriv na leto 

In [143]:
import plotly.graph_objects as go


years_fuel_dict = dict()
gasoline_ratios= list()
diesel_ratios= list()
electricity_ratios= list()
for y in range(1975, 2022):
    car_fuel = cars[cars['B-Datum prve registracije vozila'].dt.year == y]
    car_fuel = collections.Counter(car_fuel['P13-Vrsta goriva (opis)'])
    for key in ['Bencin', 'Dizel', 'Ni goriva']:
        if key is None or key not in car_fuel:
            car_fuel[key] = 0;
    total = car_fuel['Bencin']+ car_fuel['Dizel']+ car_fuel['Ni goriva'];
    if total!=0:
        gasoline_ratios.append(car_fuel['Bencin']/total*100)
        diesel_ratios.append(car_fuel['Dizel']/total*100)
        electricity_ratios.append(car_fuel['Ni goriva']/total*100)
        years_fuel_dict[y]= [car_fuel['Bencin']/total*100, car_fuel['Dizel']/total*100, car_fuel['Ni goriva']/total*100]



trace1 = go.Bar(x=list(years_fuel_dict.keys()), y=gasoline_ratios, name='Gasoline')
trace2 = go.Bar(x=list(years_fuel_dict.keys()), y=diesel_ratios, name='Diesel')
trace3 = go.Bar(x=list(years_fuel_dict.keys()), y=electricity_ratios, name='Electricity')

layout = go.Layout(
    title='Percentages of different fuels by year',
    barmode='stack',
    xaxis=dict(
        tickmode='array',
        tickvals=list(years_fuel_dict.keys()),
        ticktext=[str(year) + ' ' * 8 for year in years_fuel_dict.keys()]  # Adjust the spacing as needed
    ),

    margin=dict(
        l=80,  # Adjust the left margin value as needed
        r=50,
        b=50,
        t=80,
        pad=4
    )
)

# Create the figure
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.update_layout(dict( height=1000))              

# Show the figure
pyo.iplot(fig)


### Razmerje med kombinirano porabo vozila in močjo vozila za različne starostne skupine

In [184]:
import math
import plotly.express as px
import numpy as np

filtered_cars = cars
filtered_cars['P12-Nazivna moc'] = pd.to_numeric(cars['P12-Nazivna moc'], errors='coerce')
# Remove rows where 'column_name' is NaN
filtered_cars = filtered_cars.dropna(subset=['P12-Nazivna moc'])
filtered_cars['V8-Kombinirana poraba goriva'] = pd.to_numeric(filtered_cars['V8-Kombinirana poraba goriva'], errors='coerce')
# Remove rows where 'column_name' is NaN
filtered_cars = filtered_cars.dropna(subset=['V8-Kombinirana poraba goriva'])

filtered_cars['V8-Kombinirana poraba goriva'] = filtered_cars['V8-Kombinirana poraba goriva'].astype(float)
filtered_cars['P12-Nazivna moc'] = filtered_cars['P12-Nazivna moc'].astype(float)

# Get a boolean series representing which row satisfies the condition
mask = filtered_cars['P13-Vrsta goriva (opis)'].isin(['Bencin', 'Dizel'])
# Filter the dataframe using the boolean series
filtered_cars = filtered_cars[mask]

# Define the bin edges
bin_edges = [0,30,55,200]

# Define the bin labels
bin_labels = ['under 25', 'from 25 to 50', 'above 50']

# Discretize the 'C-Starost uporabnika vozila' column into age groups
filtered_cars['Age Group'] = pd.cut(filtered_cars['C-Starost uporabnika vozila'], bins=bin_edges, labels=bin_labels)

# Count the number of cars in each age group
age_group_counts = collections.Counter(filtered_cars['Age Group'])
display(pd.DataFrame(columns=[list(age_group_counts.keys()),list(age_group_counts.values())]))
# Find the minimum count among age groups
min_count = min(age_group_counts.values())

# Scatter plot of nominal power vs combined fuel consumption with color by fuel type
fig = px.scatter(filtered_cars, x="P12-Nazivna moc", y="V8-Kombinirana poraba goriva", color="Age Group")
fig.update_yaxes(range=[0, 20])
fig.update_xaxes(range=[0, 450])
fig.update_layout(title='Car engine power in relation to combined fuel usage',
                   xaxis_title='Car engine power (KW)',
                   yaxis_title='Combined fuel usage (l/100km)')
fig.update_layout(dict(height=1000))              
fig.update_traces(marker=dict(size=11))

fig.show()


# Create a new DataFrame with equal number of points for each age group
equal_sampled_cars = pd.DataFrame()
for age_group, count in age_group_counts.items():
    if len(filtered_cars[filtered_cars['Age Group'] == age_group])>0:
        age_group_data = filtered_cars[filtered_cars['Age Group'] == age_group].sample(n=min_count, random_state=12)
        equal_sampled_cars = pd.concat([equal_sampled_cars, age_group_data])

# Scatter plot with equal number of points for each age group
fig = px.scatter(equal_sampled_cars, x="P12-Nazivna moc", y="V8-Kombinirana poraba goriva", color="Age Group")
fig.update_yaxes(range=[0, 20])
fig.update_xaxes(range=[0, 450])
fig.update_layout(title='Car engine power in relation to combined fuel usage, equalized',
                   xaxis_title='Car engine power (KW)',
                   yaxis_title='Combined fuel usage (l/100km)')
fig.update_layout(dict(height=1000))              
fig.update_traces(marker=dict(size=11))

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,from 25 to 50,above 50,under 25,NaN
Unnamed: 0_level_1,13896,9738,2509,2358


In [192]:
import pandas as pd
import plotly.graph_objects as go
filtered_cars = cars
filtered_cars['P12-Nazivna moc'] = pd.to_numeric(cars['P12-Nazivna moc'], errors='coerce')
# Remove rows where 'column_name' is NaN
filtered_cars = filtered_cars.dropna(subset=['P12-Nazivna moc'])
filtered_cars['V8-Kombinirana poraba goriva'] = pd.to_numeric(filtered_cars['V8-Kombinirana poraba goriva'], errors='coerce')
# Remove rows where 'column_name' is NaN
filtered_cars = filtered_cars.dropna(subset=['V8-Kombinirana poraba goriva'])

filtered_cars['V8-Kombinirana poraba goriva'] = filtered_cars['V8-Kombinirana poraba goriva'].astype(float)
filtered_cars['P12-Nazivna moc'] = filtered_cars['P12-Nazivna moc'].astype(float)
# Assuming your dataset is stored in a DataFrame called 'cars_data'
correlation_matrix = cars.corr()

# Create a heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,  # Correlation values
    x=correlation_matrix.columns,  # X-axis labels
    y=correlation_matrix.columns,  # Y-axis labels
    colorscale='RdBu',  # Color scale (you can choose a different one)
))

fig.update_layout(
    title='Correlation Heatmap of Attributes',
    xaxis_title='Attributes',
    yaxis_title='Attributes'
)
fig.update_layout(dict(height=1000,width=1000))  
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



