In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 1) Load the raw semicolon-separated export. The file has no header row, and
#    every cell is read as text so the codes keep their exact spelling.
path_to_csv = r'../Data/Recidiv9.csv'
df = pd.read_csv(
    path_to_csv,
    sep=';',
    header=None,
    dtype=str,
)

# 2) Count the period columns: everything after the four leading dimension
#    columns.
n_cols = len(df.columns)
n_periods = n_cols - 4

# 3) Build one label per period column. Each label reads "<first>:<first + 2>",
#    starting at 2008:2010 and moving the first year forward by one per column.
start_year = 2008
period_labels = [
    f"{start_year + offset}:{start_year + offset + 2}"
    for offset in range(n_periods)
]

# 4) Name the columns: the four dimension codes first, then the period labels.
df.columns = [
    "Recidivh√¶ndelser_code",
    "Uddannelse_code",
    "K√∏n_code",
    "Alder_code",
    *period_labels,
]

# 5) Turn every period column into numbers while the data is still wide.
#    errors="coerce" replaces cells that cannot be parsed with NaN instead of
#    raising.
for col in period_labels:
    raw_counts = df[col]
    df[col] = pd.to_numeric(raw_counts, errors="coerce")

# 6) Optional, for ML use only: build a copy of the wide frame in which every
#    object-dtype column is replaced by integer labels. `df` itself is left
#    untouched. A fresh encoder is fitted per column, so after the loop `le`
#    refers to the encoder of the last column processed.
df_label_encoded = df.copy()
data_column_category = df.select_dtypes(include="object").columns
for col in data_column_category:
    le = LabelEncoder()
    df_label_encoded[col] = le.fit_transform(df[col])

# 7) Reshape wide -> long: the four code columns identify a row, each period
#    column becomes one row whose label lands in "Periode" and whose value
#    lands in "Antal".
df_long = pd.melt(
    df,
    id_vars=[
        "Recidivh√¶ndelser_code",
        "Uddannelse_code",
        "K√∏n_code",
        "Alder_code",
    ],
    value_vars=period_labels,
    var_name="Periode",
    value_name="Antal",
)

# 8) Lookup tables that translate the raw codes into readable labels.
#    Keys are strings; each table pairs its codes (first tuple) with the
#    labels (second tuple) position by position.

# Number of relapses.
recid_map = dict(zip(
    ("106", "107", "108", "109", "110", "111"),
    (
        "Ingen tilbagefald",
        "1 tilbagefald",
        "2 tilbagefald",
        "3 tilbagefald",
        "4-9 tilbagefald",
        "10 eller flere tilbagefald",
    ),
))

# Education level.
udd_map = dict(zip(
    ("10", "20", "35", "40", "00"),
    (
        "Grundskole",
        "Gymnasial uddannelse",
        "Erhvervsuddannelse",
        "Videreg√•ende uddannelse",
        "Uoplyst uddannelse",
    ),
))

# Gender.
gender_map = {"M": "M√¶nd", "K": "Kvinder"}

# Age bracket.
age_map = dict(zip(
    ("1519", "2024", "2529", "3034", "3539", "4049", "5059", "6099"),
    (
        "15‚Äì19 √•r",
        "20‚Äì24 √•r",
        "25‚Äì29 √•r",
        "30‚Äì34 √•r",
        "35‚Äì39 √•r",
        "40‚Äì49 √•r",
        "50‚Äì59 √•r",
        "60+ √•r",
    ),
))

# Translate each code column into a readable label column.
# Series.map yields NaN for any code that is absent from its dictionary, so
# codes the lookup tables do not cover are reported with a warning instead of
# silently turning into missing labels.
import warnings

# (new label column, source code column, code -> label dictionary), listed in
# the order the label columns are appended to df_long.
_label_specs = (
    ("Recidivh√¶ndelser", "Recidivh√¶ndelser_code", recid_map),
    ("Uddannelse", "Uddannelse_code", udd_map),
    ("K√∏n", "K√∏n_code", gender_map),
    ("Alder", "Alder_code", age_map),
)
for _label_col, _code_col, _code_map in _label_specs:
    _codes = df_long[_code_col]
    df_long[_label_col] = _codes.map(_code_map)
    # Missing codes (NaN) are excluded: they have no label by definition.
    _unknown = sorted(set(_codes.dropna()) - set(_code_map), key=str)
    if _unknown:
        warnings.warn(
            f"{_code_col}: no label defined for code(s) {_unknown}; "
            f"the affected rows get NaN in '{_label_col}'."
        )

# 9) Format and type conversion
# Swap the ":" separator in each period label for the dash literal below.
# regex=False forces a plain-text replacement so the result does not depend on
# the pandas version: the default of `regex` changed from True to False in
# pandas 2.0, and 1.x releases warn about single-character patterns like ":".
df_long["Periode"] = df_long["Periode"].str.replace(":", "‚Äì", regex=False)
# Safety net: a column that is already numeric passes through unchanged;
# anything unparseable becomes NaN.
df_long["Antal"] = pd.to_numeric(df_long["Antal"], errors="coerce")

# 10) Remove the four raw code columns; the label columns remain.
df_long = df_long.drop(
    [
        "Recidivh√¶ndelser_code",
        "Uddannelse_code",
        "K√∏n_code",
        "Alder_code",
    ],
    axis="columns",
)

# 11) Sanity checks on the long-format frame. Each heading and its report are
#     printed together, separated by a newline.
print("\nüßæ Preview of the first 10 rows:", df_long.head(10), sep="\n")

# DataFrame.info() prints its report itself, so only the heading is printed
# here.
print("\nüìã DataFrame structure:")
df_long.info()

print("\nüîç Data types:", df_long.dtypes, sep="\n")

print(
    "\nüìä Statistical summary of 'Antal':",
    df_long["Antal"].describe(),
    sep="\n",
)

# 12) Write the cleaned long-format frame to disk without the row index, then
#     report its (rows, columns) shape.
cleaned_csv_path = "../Data/CleanedRecidiv9.csv"
df_long.to_csv(cleaned_csv_path, index=False)
print(f"\n‚úÖ Exported cleaned data with shape: {df_long.shape}")

# Optional: preview a random sample of the original wide dataset.
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so the sample size is capped at the
# number of available rows.
print("\nüé≤ Sample from original dataset:")
print(df.sample(n=min(10, len(df))))



🧾 Preview of the first 10 rows:
     Periode  Antal   Recidivhændelser  Uddannelse      Køn     Alder
0  2008–2010   3502  Ingen tilbagefald  Grundskole     Mænd  15–19 år
1  2008–2010   2908  Ingen tilbagefald  Grundskole     Mænd  20–24 år
2  2008–2010   2126  Ingen tilbagefald  Grundskole     Mænd  25–29 år
3  2008–2010   1996  Ingen tilbagefald  Grundskole     Mænd  30–34 år
4  2008–2010   2185  Ingen tilbagefald  Grundskole     Mænd  35–39 år
5  2008–2010   4224  Ingen tilbagefald  Grundskole     Mænd  40–49 år
6  2008–2010   2580  Ingen tilbagefald  Grundskole     Mænd  50–59 år
7  2008–2010   2159  Ingen tilbagefald  Grundskole     Mænd    60+ år
8  2008–2010   1155  Ingen tilbagefald  Grundskole  Kvinder  15–19 år
9  2008–2010    805  Ingen tilbagefald  Grundskole  Kvinder  20–24 år

📋 DataFrame structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6240 entries, 0 to 6239
Data columns (total 6 columns):
 # 