# Population in Ireland

# Adım 1: Veri Ön İşleme ve Temizleme

## 1. Veri setinin yüklenmesi ve temel kontrol

In [1]:
import pandas as pd               # Used for data analysis and processing.
import numpy as np                # Used for multi-dimensional arrays and mathematical operations.
import seaborn as sns             # Provides a high-level interface for data visualization.
import matplotlib.pyplot as plt   # Used for creating plots and graphs.
%matplotlib inline
sns.set(color_codes=True)

In [2]:
import math
from scipy.stats import poisson

In [3]:
import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

In [4]:
df=pd.read_csv("PEA01-3.csv") # we read dataset

In [5]:
df.head(5) # We display the first 5 rows

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,UNIT,VALUE
0,Population Estimates (Persons in April),1950,0 - 14 years,Both sexes,Thousand,
1,Population Estimates (Persons in April),1950,0 - 14 years,Male,Thousand,434.6
2,Population Estimates (Persons in April),1950,0 - 14 years,Female,Thousand,416.6
3,Population Estimates (Persons in April),1950,15 - 24 years,Both sexes,Thousand,452.6
4,Population Estimates (Persons in April),1950,15 - 24 years,Male,Thousand,234.9


In [6]:
df.tail(5) # We display last 5 rows

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,UNIT,VALUE
1327,Population Estimates (Persons in April),2023,65 years and over,Male,Thousand,379.9
1328,Population Estimates (Persons in April),2023,65 years and over,Female,Thousand,426.4
1329,Population Estimates (Persons in April),2023,All ages,Both sexes,Thousand,5281.6
1330,Population Estimates (Persons in April),2023,All ages,Male,Thousand,2606.2
1331,Population Estimates (Persons in April),2023,All ages,Female,Thousand,2675.4


In [7]:
# DataFrame.shape is a (rows, columns) tuple; unpack it and report both counts.
total_rows, total_columns = df.shape
print(f"Total Rows= {total_rows}")
print(f"Total Columns= {total_columns}")

Total Rows= 1332
Total Columns= 6


In [8]:
df.describe() # We use this method to see summary of columns numerical (numeric) data

Unnamed: 0,Year,VALUE
count,1332.0,1318.0
mean,1986.5,808.058877
std,21.368032,868.888579
min,1950.0,149.3
25%,1968.0,310.775
50%,1986.5,477.7
75%,2005.0,870.475
max,2023.0,5281.6


In [9]:
df.describe(include="object") # We use this method to see summary of columns containing categorical (object) data

Unnamed: 0,STATISTIC Label,Age Group,Sex,UNIT
count,1332,1332,1332,1332
unique,1,6,3,1
top,Population Estimates (Persons in April),0 - 14 years,Both sexes,Thousand
freq,1332,222,444,1332


In [10]:
df.dtypes # We use this method to see the data types of each columns

STATISTIC Label     object
Year                 int64
Age Group           object
Sex                 object
UNIT                object
VALUE              float64
dtype: object

## 2. Temizlik ve veri önişleme 

In [11]:
df.nunique() # We use this method to find the count of different values in each column.

STATISTIC Label       1
Year                 74
Age Group             6
Sex                   3
UNIT                  1
VALUE              1214
dtype: int64

In [12]:
df.count() # We use this method to calculate the number of non-null (non-NaN) values in each column.

STATISTIC Label    1332
Year               1332
Age Group          1332
Sex                1332
UNIT               1332
VALUE              1318
dtype: int64

In [13]:
df.isnull().sum() # We use this method to calculate the number of null (NaN) values in each column.

STATISTIC Label     0
Year                0
Age Group           0
Sex                 0
UNIT                0
VALUE              14
dtype: int64

In [14]:
df.duplicated().sum() # We use this method to calculate the number of duplicated rows

0

In [15]:
# Drop the columns that hold a single constant value ("STATISTIC Label" and "UNIT"),
# since they carry no information for the analysis.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the missing columns.
to_drop = ["STATISTIC Label", "UNIT"]
df = df.drop(columns=to_drop, errors="ignore")

In [16]:
# We rename the 'VALUE' column to maintain consistent column title style.
df=df.rename(columns={"VALUE":"Value"})

In [17]:
#df["Value"]=(df["Value"]*10) # we changed the value because we drop UNIT column where it is writing exact value label

In [19]:
#df["Value"]

In [20]:
missing_df=df[df.isna().any(axis=1)] #missing data rows
missing_df

Unnamed: 0,Year,Age Group,Sex,Value
0,1950,0 - 14 years,Both sexes,
72,1954,0 - 14 years,Both sexes,
216,1962,0 - 14 years,Both sexes,
234,1963,0 - 14 years,Both sexes,
360,1970,0 - 14 years,Both sexes,
396,1972,0 - 14 years,Both sexes,
522,1979,0 - 14 years,Both sexes,
576,1982,0 - 14 years,Both sexes,
684,1988,0 - 14 years,Both sexes,
774,1993,0 - 14 years,Both sexes,


In [21]:
import pandas as pd
import numpy as np

# `df` is the population data set loaded above.

# Sum the 'Male' and 'Female' values for every (Year, Age Group) pair.
# min_count=2 leaves the total as NaN unless both components are present, so a
# missing male or female figure can never be passed off as the combined total.
sex_totals = (
    df[df['Sex'].isin(['Male', 'Female'])]
    .groupby(['Year', 'Age Group'])['Value']
    .sum(min_count=2)
)

# Fill the missing 'Both sexes' values with those totals in one vectorised
# assignment (this replaces a row-by-row iterrows() loop over every group).
needs_fill = (df['Sex'] == 'Both sexes') & df['Value'].isna()
fill_keys = pd.MultiIndex.from_frame(df.loc[needs_fill, ['Year', 'Age Group']])
df.loc[needs_fill, 'Value'] = sex_totals.reindex(fill_keys).to_numpy()

# Check the result
print(df)


      Year          Age Group         Sex   Value
0     1950       0 - 14 years  Both sexes   851.2
1     1950       0 - 14 years        Male   434.6
2     1950       0 - 14 years      Female   416.6
3     1950      15 - 24 years  Both sexes   452.6
4     1950      15 - 24 years        Male   234.9
...    ...                ...         ...     ...
1327  2023  65 years and over        Male   379.9
1328  2023  65 years and over      Female   426.4
1329  2023           All ages  Both sexes  5281.6
1330  2023           All ages        Male  2606.2
1331  2023           All ages      Female  2675.4

[1332 rows x 4 columns]


In [22]:
all_0_14=df[(df["Age Group"]=="0 - 14 years")]
all_0_14

Unnamed: 0,Year,Age Group,Sex,Value
0,1950,0 - 14 years,Both sexes,851.2
1,1950,0 - 14 years,Male,434.6
2,1950,0 - 14 years,Female,416.6
18,1951,0 - 14 years,Both sexes,854.8
19,1951,0 - 14 years,Male,436.4
...,...,...,...,...
1297,2022,0 - 14 years,Male,519.0
1298,2022,0 - 14 years,Female,495.3
1314,2023,0 - 14 years,Both sexes,1011.6
1315,2023,0 - 14 years,Male,516.5


## 3. Tek Değişkenli Analiz

In [None]:
a=df[(df["Sex"]=="Both sexes") & (df["Age Group"]=="All ages")]
#a=a[a["Year"]>=2010]
#a

In [None]:
b=df[(df["Age Group"]=="65 years and over") & (df["Sex"]=="Both sexes")]
#b

In [None]:
df_male=df[(df["Age Group"]=="All ages") & (df["Sex"]=="Male")]
df_female=df[(df["Age Group"]=="All ages") & (df["Sex"]=="Female")]

#df_male
#df_female

In [None]:
# 'Value' değişkeni için bir histogram çizelim.
plt.figure(figsize=(10, 6))
sns.histplot(a['Value'], kde=True)
plt.title('Value Değişkeninin Histogramı')
plt.xlabel('Value')
plt.ylabel('Frekans')
plt.show()

In [None]:
# 'Value' değişkeni için bir kutu grafiği çizelim.
plt.figure(figsize=(10, 6))
sns.boxplot(y=a['Value'])
plt.title('Value Değişkeninin Kutu Grafiği')
plt.ylabel('Value')
plt.show()

In [None]:
# 'Year' ve 'Value' arasında bir ilişki olup olmadığını görmek için bir scatter plot çizelim.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['Year'], y=df['Value'])
plt.title('Yıl ve Value İlişkisi')
plt.xlabel('Year')
plt.ylabel('Value')
plt.show()

In [None]:
# Cinsiyet için çubuk grafik
plt.figure(figsize=(10, 5))
sns.countplot(x='Sex', data=df)
plt.title('Sex Distribution')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()

In [None]:
# Yaş Grubu için çubuk grafik
plt.figure(figsize=(15, 7))
sns.countplot(x='Age Group', data=df, order = df['Age Group'].value_counts().index)
plt.title('Age Group Distribution')
plt.xlabel('Age Group')
plt.xticks(rotation=90)
plt.ylabel('Count')
plt.show()

In [None]:
# Line chart of the national population per year.
# Only the "Both sexes" / "All ages" rows are plotted. Every person is also counted in
# the per-sex and per-age-group rows, so summing all rows of a year (the previous
# estimator=sum) overstated the population several times over.
national_totals = df[(df['Sex'] == 'Both sexes') & (df['Age Group'] == 'All ages')]
plt.figure(figsize=(15, 7))
# estimator=None draws the single observation per year as-is: no aggregation and no
# confidence band, which also avoids the deprecated `ci` argument.
sns.lineplot(x='Year', y='Value', data=national_totals, estimator=None)
plt.title('Population over Years')
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()

In [None]:
# Yaş Grubu ve Nüfus değerleri arasında bir çizgi grafiği
# Bu, yaş gruplarına göre toplam nüfusu gösterecektir.
plt.figure(figsize=(15, 7))
sns.lineplot(x='Age Group', y='Value', data=df, estimator=sum, ci=None)
plt.title('Population by Age Group')
plt.xlabel('Age Group')
plt.xticks(rotation=90)
plt.ylabel('Population')
plt.show()

In [None]:
# Cinsiyet ve Nüfus değerleri arasındaki ilişki
plt.figure(figsize=(15, 7))
sns.barplot(x='Sex', y='Value', data=df, estimator=sum)
plt.title('Population by Sex')
plt.xlabel('Sex')
plt.ylabel('Population')
plt.show()

In [None]:
# 'Age Group' ve 'Value' için gruplanmış özet istatistikler
age_group_stats = df.groupby('Age Group')['Value'].describe()
print(age_group_stats)

In [None]:
# 'Sex' ve 'Value' için gruplanmış özet istatistikler
sex_stats = df.groupby('Sex')['Value'].describe()
print(sex_stats)

# Keşifsel Veri Analizi (EDA)

In [None]:
# Show the first rows, the last rows, a random sample and summary information.
print(df.head())
print(df.tail())
print(df.sample(5))
# info() writes its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
df.info()
print(df.describe(include='all'))  # Summary statistics for both numeric and categorical columns


In [None]:
df['Value'].describe()

In [None]:
#df['Age Group'].unique()

In [None]:
#df['Year'].unique()

In [None]:
#df['Sex'].unique()

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Identify options
year_option = df['Year'].unique()
age_group_option = df['Age Group'].unique()
sex_option = df['Sex'].unique()

# Create dropdown widget
dropdown_year = widgets.Dropdown(options=year_option,description="Year:")
dropdown_age_group = widgets.Dropdown(options=age_group_option,description="Age Group:")
dropdown_sex = widgets.Dropdown(options=sex_option,description="Sex:")

# Show Widget
print("Please select a option!")
display(dropdown_year)
display(dropdown_age_group)
display(dropdown_sex)

In [None]:
year = dropdown_year.value
age_group = dropdown_age_group.value
sex= dropdown_sex.value

result = df[(df['Year'] == year) & (df['Age Group'] == age_group) & (df['Sex'] == sex)]['Value'].values[0]
print(f"In '{year}', the result for the age group '{age_group}', and the sex '{sex}' is: {result}")

In [None]:
table_year_age_group = df[(df['Year'] == year) & (df['Age Group'] == age_group)]
table_year_age_group

In [None]:
table_age_group_sex = df[(df['Age Group'] == age_group) & (df["Sex"] == sex)]
table_age_group_sex

In [None]:
df["Year"].unique()

In [None]:
df["Age Group"].unique()

In [None]:
df["Sex"].unique()

In [None]:
print(np.min(df["Value"]))
print(np.max(df["Value"]))
print(np.mean(df["Value"]))
print(np.median(df["Value"]))

In [None]:
#year = dropdown_year.value
#age_group = dropdown_age_group.value
#sex= dropdown_sex.value
#print("Year:", year)
#print("Age Group:", age_group)
#print("Sex:", sex,"\n")

## We print the result for a specific year, age group, and sex.
#result = df[(df['Year'] == year) & (df['Age Group'] == age_group) & (df['Sex'] == sex)]['Value'].values[0]
#print(f"In '{year}', the result for the age group '{age_group}', and the sex '{sex}' is: {result}")

In [None]:
# year = 1965
# age_range = "1 - 4 years"
# sex = "Both sexes"

# # We print the result for a specific year, age range, and gender.
# result = df[(df['Year'] == year) & (df['Age Group'] == age_range) & (df['Sex'] == sex)]['Value'].values[0]
# print(f"In {year}, the result for the age group {age_range}, and the gender {sex} is: {result}")

In [None]:
# toplam_yas = df[(df['Year'] == 1965) & (df['Sex'] == "Both sexes") & (df['Age Group'].isin(['Under 1 year', '1 - 4 years']))]['Value'].sum()
# toplam_yas

In [None]:
#all_0_4=df[(df["Age Group"]=="0 - 4 years")]
#all_0_4.head(5)

In [None]:
#all_0_4.shape

In [None]:
#all_0_4.isnull().sum()

In [None]:
# Keep the values as floats. Casting to int here truncated the fractional thousands
# (e.g. 434.6 -> 434) right before the conversion to millions in the next cell, and it
# would raise if any NaN were still present in the column.
df["Value"] = df["Value"].astype(float)

In [None]:
# Express the values in millions: the raw figures are in thousands (the dropped UNIT
# column read "Thousand").
# NOTE: this cell is not idempotent - running it again divides by 1000 a second time.
df["Value"] = df["Value"] / 1000

# a, b, df_male and df_female were sliced from df before this rescaling. Boolean indexing
# returns copies, so they would still hold thousands. Rebuild them here so the later
# cells, whose axis labels read "Value (Million)", work on the rescaled values.
a = df[(df["Sex"] == "Both sexes") & (df["Age Group"] == "All ages")]
b = df[(df["Age Group"] == "65 years and over") & (df["Sex"] == "Both sexes")]
df_male = df[(df["Age Group"] == "All ages") & (df["Sex"] == "Male")]
df_female = df[(df["Age Group"] == "All ages") & (df["Sex"] == "Female")]

In [None]:
# Box plot with an overlaid swarm plot of `a` (the "Both sexes" / "All ages" rows), one
# category per year.
# NOTE(review): the y label and the 0.5 tick step assume `a` holds values in millions -
# confirm that `a` was built after the rescaling cell.
plt.figure(figsize=(12,6))
sns.boxplot(x="Year", y="Value", data=a)
sns.swarmplot(x="Year", y="Value", data=a)

# Label the axes
plt.xlabel("Years")
plt.ylabel("Value (Million)")

# x ticks: every fifth year starting from the first year in `a`, plus 2023
years = [str(year) for year in range(min(a["Year"]), max(a["Year"]), 5)]
years.append("2023")

step = 0.5  # spacing between the regular y ticks

# y ticks: the smallest value (rounded to 2 decimals), then multiples of 0.5 starting at
# the first one at or above that minimum, and finally the largest value (rounded).
asd=math.ceil(round(min(a["Value"]),2)*2)/2
values = list(np.arange(asd, round(max(a["Value"]),2), step))
values.insert(0, round(min(a["Value"]),2))
values.insert(len(values), round(max(a["Value"]),2))
plt.xticks(ticks=years, rotation=90)
plt.yticks(ticks=values)
plt.show()


In [None]:
def ecdf(data):
    """Compute the empirical cumulative distribution function of ``data``.

    Parameters
    ----------
    data : one-dimensional sequence or array of numbers.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        ``x`` is ``data`` sorted in ascending order; ``y[k]`` is ``(k + 1) / n``,
        the fraction of the ``n`` observations that are <= ``x[k]``.
    """
    count = len(data)
    sorted_values = np.sort(data)
    cumulative_share = np.arange(1, count + 1) / count
    return sorted_values, cumulative_share

In [None]:
plt.figure(figsize=(12,6))
percentiles=np.array([10,25,50,75,90])
percentiles_value=np.percentile(a["Value"],percentiles)
x_value,y_value=ecdf(a["Value"])
plt.plot(percentiles_value, percentiles/100, marker='D', color='red',linestyle="none")
plt.plot(x_value,y_value,marker=".",linestyle="none")
plt.xlabel("Value (Million)")
plt.ylabel("ECDF")
plt.show()

In [None]:
plt.figure(figsize=(12,6))
#x_both, y_both = ecdf(df[(df["Sex"]=="Both sexes") & (df["Age Group"]=="All ages")]["Value"])
x_male, y_male = ecdf(df[(df["Sex"]=="Male") & (df["Age Group"]=="All ages")]["Value"])
x_female, y_female = ecdf(df[(df["Sex"]=="Female") & (df["Age Group"]=="All ages")]["Value"])
#plt.plot(x_both, y_both, marker=".", linestyle="none")
plt.plot(x_male, y_male, marker=".", linestyle="none")
plt.plot(x_female, y_female, marker=".", linestyle="none")
#plt.legend(("Both sexes","Male","Female"),loc="lower right")
plt.legend(("Male","Female"),loc="lower right")
plt.xlabel("Value (Million)")
plt.ylabel("ECDF")
#plt.xticks(ticks=values)
plt.show()

In [None]:
plt.figure(figsize=(12,6))
n_data=len(a["Value"])
n_bins=np.sqrt(n_data)
n_bins=int(n_bins)
plt.hist(a["Value"],n_bins)
plt.xlabel("Value")
plt.ylabel("Count")
plt.show()

In [None]:
plt.figure(figsize=(12,6))

desired_age_groups = ['0 - 14 years', '15 - 24 years', '25 - 44 years', '45 - 64 years', '65 years and over']
sex_filter = df['Sex'] == 'Both sexes'
age_group_filter = df['Age Group'].isin(desired_age_groups)

filtered_df = df[sex_filter & age_group_filter]

sns.boxplot(x="Age Group", y="Value",data=filtered_df)
sns.swarmplot(x="Age Group", y="Value",data=filtered_df,hue="Year",palette="Spectral")

plt.xlabel("Age Group")
plt.ylabel("Population")
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize=(10,5))

plt.plot(df_male["Value"], df_female["Value"], marker=".", linestyle="none")

plt.xlabel("Male")
plt.ylabel("Female")

plt.show()

In [None]:
from scipy.stats import binom


In [None]:
# Parameters of the binomial distribution
n = 10  # total number of trials
p = 0.2  # probability of success in each individual trial

# Probability of observing exactly x successes in the n trials
x = 3  # number of successes of interest
probability = binom.pmf(x, n, p)

# Print the result. The message interpolates x, the success count the probability refers
# to; it previously printed n, the number of trials.
print(f"2023 yılında '25 - 44 years' yaş grubunda, 'Male' cinsiyette belirli bir olayın {x} kez gerçekleşme olasılığı: {probability:.4f}")


In [None]:
Q1=b["Value"].quantile(0.25)
Q3=b["Value"].quantile(0.75)
IQR=Q3-Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print("IQR: (Million)",IQR)



In [None]:
# Tukey fences for the values in `b`: 1.5 * IQR below the first and above the third quartile.
Q1, Q3 = (b["Value"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(IQR)

# Flag the rows outside the fences; keep them both as a frame and as a plain list of values.
outliers = b["Value"].lt(lower_bound) | b["Value"].gt(upper_bound)
outliers_b = b.loc[outliers]
outliers_list = b.loc[outliers, "Value"].tolist()

outliers_list


In [None]:
sns.boxplot(x="Age Group", y="Value",data=b)
plt.show()

In [None]:
import pandas as pd
from scipy.stats import binom

# Binomial probability of seeing exactly x values above the upper outlier fence in `b`.

# Threshold above which a value counts as an outlier (the upper fence computed earlier)
esik_deger = upper_bound  # outlier threshold

# Estimate the per-observation outlier probability from the data itself
n = b["Value"].count()  # number of non-null observations
p = len(b[b["Value"] > esik_deger]) / n  # share of observations above the threshold

# Number of outliers whose probability is wanted
x = 7

# Compute the probability with the binomial distribution
probability = binom.pmf(x, n, p)

# Print the result
print(f"Belirli bir yıl, yaş grubu ve cinsiyette {x} aykırı değer olma olasılığı: {probability:.4f}")


In [None]:
# Use the mean of b["Value"] as the rate of a Poisson distribution
# NOTE(review): this rate is a mean population figure, not an event count - confirm
# that it is the intended Poisson parameter.
ortalama_olay_hizi = b["Value"].mean()

# Number of events whose probability is wanted
x = 1

# Compute the probability with the Poisson distribution
probability = poisson.pmf(x, ortalama_olay_hizi)

# Print the result
print(f"Belirli bir yıl, yaş grubu ve cinsiyette {x} nadir olayın olasılığı: {probability:.4f}")


In [None]:
ortalama_olay_hizi = b["Value"].mean()

# En az 1 aykırı değer olma olasılığını hesaplayın
x = 0  # En az 1 aykırı değer olmaması gerektiği için x = 0
olasilik = 1 - poisson.cdf(x, ortalama_olay_hizi)
print(f"Belirli bir yıl 1 veya daha fazla aykırı değer olma olasılığı: {olasilik:.4f}")

In [None]:
n

In [None]:
p

In [None]:
upper_bound

In [None]:
np.max(b["Value"])

In [None]:
# Compute the 2x2 covariance matrix of the male and female "All ages" values: covariance_matrix
covariance_matrix=np.cov(df_male["Value"],df_female["Value"])

# Print covariance matrix
print(covariance_matrix)

# Extract the off-diagonal entry, i.e. the covariance between the male and female values: Value_cov
Value_cov=covariance_matrix[0,1]

# Print the male/female covariance
print(Value_cov)


In [None]:
def pearson_r(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The value is the off-diagonal entry of the 2x2 matrix produced by ``np.corrcoef``.
    """
    return np.corrcoef(x, y)[0, 1]

r=pearson_r(df_male["Value"],df_female["Value"])

# Print the result
print(r)

In [None]:
def perform_bernoulli_trials(n, p):
    """Count the successes in ``n`` Bernoulli trials with success probability ``p``.

    Each trial draws one number in [0, 1) from the notebook-level generator ``rng``
    and counts as a success when that draw is below ``p``.

    Parameters
    ----------
    n : number of trials to perform.
    p : probability of success of a single trial.

    Returns
    -------
    int : the number of successful trials.
    """
    return sum(1 for _ in range(n) if rng.random() < p)

In [None]:
# Instantiate and seed random number generator
rng=np.random.default_rng(seed=42)

# Initialize the number of defaults: n_defaults
n_defaults=np.empty(1000)

# Compute the number of defaults
for i in range(1000):
    n_defaults[i] = perform_bernoulli_trials(100,0.05)


# Plot the histogram with default number of bins; label your axes
_ = plt.hist(n_defaults, density=True)
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')

# Show the plot
plt.show()

In [None]:
# Compute ECDF: x, y
x,y=ecdf(n_defaults)
# Plot the ECDF with labeled axes
plt.plot(x,y, marker=".",linestyle="none")
plt.xlabel("x")
plt.ylabel("y")


# Show the plot
plt.show()

# Compute the number of 100-loan simulations with 10 or more defaults: n_lose_money
n_lose_money=np.sum(n_defaults>=10)

# Compute and print probability of losing money
print('Probability of losing money =', n_lose_money / len(n_defaults))


In [None]:
# Take 10,000 samples out of the binomial distribution: n_defaults
n_defaults=rng.binomial(n=100,p=0.05,size=10000)

# Compute CDF: x, y
x,y=ecdf(n_defaults)

# Plot the CDF with axis labels
plt.plot(x,y,marker=".",linestyle="none")
plt.xlabel("the number of defaults out of 100 loans")
plt.ylabel("CDF")

# Show the plot
plt.show()


In [None]:
# Compute bin edges: bins
bins = np.arange(0, max(n_defaults) + 1.5) - 0.5

# Generate histogram
plt.hist(n_defaults, bins=bins,density=True)

# Label axes
plt.xlabel("x")
plt.ylabel("y")

# Show the plot
plt.show()

In [None]:
# Draw 10,000 samples out of Poisson distribution: samples_poisson
samples_poisson=rng.poisson(10,size=10000)

# Print the mean and standard deviation
print('Poisson:     ', np.mean(samples_poisson),
                       np.std(samples_poisson))

# Specify values of n and p to consider for Binomial: n, p
n=[20,100,1000]
p=[0.5,0.1,0.01]

# Draw 10,000 samples for each n,p pair: samples_binomial
for i in range(3):
    samples_binomial = rng.binomial(n[i],p[i],size=10000)

    # Print results
    print('n =', n[i], 'Binom:', np.mean(samples_binomial),
                                 np.std(samples_binomial))


In [None]:
# Draw 10,000 samples out of Poisson distribution: n_nohitters
n_nohitters=rng.poisson(251/115,size=10000)

# Compute number of samples that are seven or greater: n_large
n_large = np.sum(n_nohitters>=7)

# Compute probability of getting seven or more: p_large
p_large=n_large/10000

# Print the result
print('Probability of seven or more no-hitters:', p_large)


In [None]:
# Draw 100000 samples from Normal distribution with stds of interest: samples_std1, samples_std3, samples_std10
samples_std1=rng.normal(20,1,size=100000)
samples_std3=rng.normal(20,3,size=100000)
samples_std10=rng.normal(20,10,size=100000)

# Make histograms
plt.hist(samples_std1,density=True,histtype="step",bins=100)
plt.hist(samples_std3,density=True,histtype="step",bins=100)
plt.hist(samples_std10,density=True,histtype="step",bins=100)

# Make a legend, set limits and show plot
_ = plt.legend(('std = 1', 'std = 3', 'std = 10'))
plt.ylim(-0.01, 0.42)
plt.show()


In [None]:
# Generate CDFs
x_std1, y_std1=ecdf(samples_std1)
x_std3, y_std3=ecdf(samples_std3)
x_std10, y_std10=ecdf(samples_std10)



# Plot CDFs
plt.plot(x_std1, y_std1, marker=".",linestyle="none")
plt.plot(x_std3, y_std3, marker=".",linestyle="none")
plt.plot(x_std10, y_std10, marker=".",linestyle="none")
# Make a legend and show the plot
_ = plt.legend(('std = 1', 'std = 3', 'std = 10'), loc='lower right')
plt.show()


In [None]:
# Compute mean and standard deviation: mu, sigma
mu=np.mean(a["Value"])
sigma=np.std(a["Value"])

# Sample out of a normal distribution with this mu and sigma: samples
samples=rng.normal(mu,sigma,size=10000)

# Get the CDF of the samples and of the data
x_theor, y_theor=ecdf(samples)
x,y = ecdf(a["Value"])

# Plot the CDFs and show the plot
_ = plt.plot(x_theor, y_theor)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('Value (Million)')
_ = plt.ylabel('CDF')
plt.show()


In [None]:
# Take a million samples out of the Normal distribution with the mean and standard
# deviation computed from a["Value"]: samples
samples=rng.normal(mu,sigma,size=1000000)

# Compute the fraction of samples that are less than or equal to 144: prob
# NOTE(review): the 144 threshold and the printed label look carried over from a
# horse-racing example and do not obviously relate to the population values - confirm
# the intended threshold and wording.
prob=np.sum(samples<=144)/len(samples)

# Print the result
print('Probability of besting Secretariat:', prob)


In [None]:
def successive_poisson(tau1, tau2, size=1):
    """Sample the total waiting time for two successive Poisson processes.

    Draws ``size`` exponential waiting times with scale ``tau1``, then ``size`` with
    scale ``tau2`` (in that order, from the notebook-level generator ``rng``) and
    returns their element-wise sum.
    """
    first_wait, second_wait = (rng.exponential(tau, size) for tau in (tau1, tau2))
    return first_wait + second_wait

In [None]:
# Draw samples of waiting times: waiting_times
waiting_times= successive_poisson(764,715,size=100000)

# Make the histogram
plt.hist(waiting_times,density=True,histtype="step",bins=100)


# Label axes
plt.xlabel("time")
plt.ylabel("y")

# Show the plot
plt.show()


In [None]:

sns.boxplot(x=a["Year"], y=a["Value"])

plt.show()


In [None]:
sns.boxplot(x=a["Year"],y=a["Value"]);

In [None]:
sns.boxplot(x=df["Value"])

In [None]:
# Tukey fences for the values in `b`: 1.5 * IQR beyond the first and third quartiles.
Q1 = b["Value"].quantile(0.25)
Q3 = b["Value"].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# `outliers` is a one-dimensional boolean Series (one flag per row of `b`), so it is used
# directly as the row mask. The earlier outliers.any(axis=1) call raised ValueError,
# because a Series has no axis 1.
outliers = (b["Value"] < lower_bound) | (b["Value"] > upper_bound)
outliers_b = b[outliers]

outliers_b

In [None]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
outliers = (b["Value"] < lower_bound) | (b["Value"] > upper_bound)

In [None]:
outliers_b = b[outliers]

In [None]:
outliers = (df['Value'] < lower_bound) | (df['Value'] > upper_bound)
outliers_df = df[outliers]


In [None]:
outliers_df

In [None]:
# `outliers` is a one-dimensional boolean Series, so .any(axis=1) raised ValueError.
# Align the mask to the rows of `b` by label (it may have been computed on the full df)
# and use it directly as the row filter.
outliers_b = b[outliers.loc[b.index]]

In [None]:
outliers_b

In [None]:
df

In [None]:
df.hist(column="Value",bins=20)

In [None]:
df["Age Group"].unique()

In [None]:
Age_Group_Value_Counts=pd.crosstab(index=df["Age Group"],columns="Value")

In [None]:
Age_Group_Value_Counts

In [None]:
from numpy import median, mean

In [None]:
sns.set(style="whitegrid")

In [None]:
plt.figure(figsize=(12,6))
ax = sns.barplot(x="Age Group", y="Value", hue="Age Group", data=df)

plt.xticks(rotation=90)
plt.legend(loc="upper right",bbox_to_anchor=(1.2,1))

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=df,x="Age Group",y="Value",hue="Sex");
plt.xticks(rotation=90);
plt.legend(loc="upper right",bbox_to_anchor=(1.2,1))

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=df,x="Year",y="Value",hue="Age Group",palette="Spectral");
plt.ylabel("Value (Thousand)")
plt.xticks(rotation=90);
plt.legend(loc="upper right",bbox_to_anchor=(1.2,1));

In [None]:
import math
subset_0_14_years=df[(df["Age Group"]=="0 - 14 years")]
print(subset_0_14_years,"\n\n")
print("Minimum value:\n",subset_0_14_years.min(),"\n")
print("Maximum value:\n",subset_0_14_years.max(),"\n")

In [None]:
#min_value=(subset_0_14_years["Value"].apply(math.ceil).min())-1
#max_value=subset_0_14_years["Value"].max()
min_value=subset_0_14_years["Value"].apply(lambda x: (math.ceil(x/100)*100)-100).min()
max_value=subset_0_14_years["Value"].apply(lambda x: math.ceil(x/100)*100).max()
plt.figure(figsize=(12,6))
sns.scatterplot(data=subset_0_14_years,x="Year",y="Value",hue="Sex");
plt.xticks(rotation=90);
plt.ylabel("Value (Thousand)")
plt.yticks(np.arange(min_value, max_value, step=30))
plt.legend(loc="upper right",bbox_to_anchor=(1.16,1.02));

In [None]:
#min_value=(subset_0_14_years["Value"].apply(math.ceil).min())-1
max_value=subset_0_14_years["Value"].max()
min_value=subset_0_14_years["Value"].apply(lambda x: (math.ceil(x/100)*100)-100).min()
plt.figure(figsize=(12,6))
sns.scatterplot(data=subset_0_14_years,x="Year",y="Value",hue="Sex");
plt.xticks(rotation=90);
plt.ylabel("Value (Thousand)")

plt.yticks(np.arange(min_value, max_value, step=30))
plt.legend(loc="upper right",bbox_to_anchor=(1.16,1.02));

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=subset_0_14_years,x="Year",y="Value",hue="Sex");
plt.xticks(rotation=90);
plt.ylabel("Value (Thousand)")
plt.legend(loc="upper right",bbox_to_anchor=(1.16,1.02));

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(subset_0_14_years["Year"], subset_0_14_years["Value"])
ax.set_title("Value By Years")
ax.set_xlabel("Year")
ax.set_ylabel("Value")
plt.show()

In [None]:
# Male rows of the youngest age group.
# The age groups seen in this data set start at "0 - 14 years"; there is no
# "0 - 4 years" group, so the previous filter matched no rows and the min/max
# summaries below were computed on an empty selection.
subset_0_14_male = df[(df["Age Group"] == "0 - 14 years") & (df["Sex"] == "Male")]
subset_0_4_years = subset_0_14_male  # original variable name kept as an alias
print(subset_0_14_male.head(),"\n\n")
print("Minimum value:\n",subset_0_14_male.min(),"\n")
print("Maximum value:\n",subset_0_14_male.max(),"\n")

In [None]:
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df["Age Group"], df["Value"])
ax.set_xlabel("Age Group")
plt.xticks(rotation=90)
ax.set_ylabel("Value")
plt.title("Value By Age Group")
plt.show()

In [None]:
# Scatter of the values that belong to the "0 - 14 years" age group.
# df["Age Group"] == "0 - 14 years" is a boolean mask. It was previously passed straight
# to scatter() as the x data, which placed every row of df at False/True instead of
# selecting the group. It is now used to filter the rows first.
rows_0_14 = df[df["Age Group"] == "0 - 14 years"]
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(rows_0_14["Age Group"], rows_0_14["Value"])
ax.set_xlabel("Age Group")
ax.set_ylabel("Value")
plt.show()

In [None]:

# # Kategorik verileri sayısal olarak kodlayalım
# df['Age Group Code'] = pd.Categorical(df['Age Group']).codes

# # Sütunları x, y ve renk olarak kullanarak scatter plot oluşturun
# fig, ax = plt.subplots(figsize=(12, 10))
# scatter = ax.scatter(df["Year"], df["Value"], c=df["Age Group Code"], cmap='viridis', marker='o')

# # Renk skalasını oluşturun ve eksen etiketlerini ayarlayın
# cbar = plt.colorbar(scatter)
# cbar.set_label("Age Group")
# ax.set_xlabel("Year")
# ax.set_ylabel("Value")
# plt.show()




In [None]:
# df["Age Group"].unique()

In [None]:
# # Belirlediğiniz renk listesi
# colors = ['red', 'green', 'blue', 'orange', 'purple']

# # Her bir "Age Group" kategorisi için "VALUE" sütununun ortalamasını hesaplayalım
# grouped_data = df.groupby('Age Group')['Value'].mean()

# # Çubuk grafik oluşturalım ve belirlediğiniz renkleri kullanalım
# fig, ax = plt.subplots(figsize=(10, 6))
# grouped_data.plot(kind='bar', ax=ax, color=colors)
# ax.set_xlabel("Age Group")
# ax.set_ylabel("Average Value")
# plt.show()

In [None]:
# # Kategorik verileri sayısal olarak kodlayalım
# age_group_order = [
#     'Under 1 year', '0 - 4 years', '0 - 14 years', '1 - 4 years', '5 - 9 years', '10 - 14 years',
#     '15 - 19 years', '15 - 24 years', '15 years and over', '20 - 24 years',
#     '25 - 29 years', '25 - 44 years', '30 - 34 years', '35 - 39 years', '40 - 44 years',
#     '45 - 49 years', '45 - 64 years', '50 - 54 years', '55 - 59 years',
#     '60 - 64 years', '65 - 69 years', '65 years and over', '70 - 74 years',
#     '75 - 79 years', '80 - 84 years', '85 years and over', 'All ages'
# ]

# df['Age Group'] = pd.Categorical(df['Age Group'], categories=age_group_order, ordered=True)
# df['Age Group Code'] = df['Age Group'].cat.codes

# # Sütunları x, y ve renk olarak kullanarak scatter plot oluşturun
# fig, ax = plt.subplots(figsize=(12, 10))
# scatter = ax.scatter(df["Year"], df["Value"], c=df["Age Group Code"], cmap='viridis', marker='o')

# # Renk skalasını oluşturun ve eksen etiketlerini ayarlayın
# cbar = plt.colorbar(scatter)
# cbar.set_label("Age Group")
# ax.set_xlabel("Year")
# ax.set_ylabel("Value")
# plt.show()