In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1) Load the raw export. The file has no header row (header=None), so pandas
#    assigns integer column labels, and every cell is read as a string
#    (dtype=str) so the category codes keep their exact text.
#    Forward slashes are used so the relative path resolves on Windows, macOS
#    and Linux alike; a backslash is only a path separator on Windows.
path_to_csv = "../Data/Recidiv10/Recidiv10.csv"
df = pd.read_csv(path_to_csv, sep=';', header=None, dtype=str)

# 2) Name the leading dimension (category-code) columns once. The number of
#    period columns is derived from this list, so the count and the names can
#    never drift apart.
dimension_cols = [
    "Varighed_til_tilbagefald_code",
    "Recidivh√¶ndelser_code",
    "Tidligere_domme_code",
    "K√∏n_code",
    "Alder_code",
]
n_cols = df.shape[1]
n_periods = n_cols - len(dimension_cols)  # every remaining column is one period

# 3) Build one "start:end" label per period column. Each label spans
#    year..year+2 and consecutive labels start one year apart, beginning at
#    start_year (so the first label is "2009:2011").
start_year = 2009
period_labels = [
    f"{year}:{year+2}"
    for year in range(start_year, start_year + n_periods)
]

# 4) Assign column names: dimension columns first, then the period columns.
#    pandas raises a length-mismatch ValueError here if the file has fewer
#    columns than there are dimension names.
df.columns = dimension_cols + period_labels

# 5) Cast all period columns to a numeric dtype in a single pass. apply() hands
#    each column to pd.to_numeric, and errors="coerce" turns any cell that
#    cannot be parsed as a number into NaN instead of raising.
df[period_labels] = df[period_labels].apply(pd.to_numeric, errors="coerce")

# 6) Build a label-encoded copy of the wide frame (optional, only for ML use).
#    Every column still stored with object dtype is replaced by the integer
#    class indices of a LabelEncoder fitted on that column alone; `df` itself
#    is not modified because assign() returns a new frame.
data_column_category = df.select_dtypes(include="object").columns
df_label_encoded = df.assign(
    **{
        name: LabelEncoder().fit_transform(df[name])
        for name in data_column_category
    }
)

# 7) Reshape wide -> long. The five code columns identify a row; every period
#    column is unpivoted into one ("Periode", "Antal") pair per row.
#    The order of the identifier list determines the column order of df_long.
melt_id_columns = [
    "Recidivh√¶ndelser_code",
    "K√∏n_code",
    "Alder_code",
    "Tidligere_domme_code",
    "Varighed_til_tilbagefald_code",
]
df_long = pd.melt(
    df,
    id_vars=melt_id_columns,
    value_vars=period_labels,
    var_name="Periode",
    value_name="Antal",
)

# 8) Optional: lookup tables that translate the raw category codes (kept as
#    strings) into readable labels. Each table is built from (code, label)
#    pairs; insertion order follows the order of the pairs.

# Codes for the recidivism-events dimension
recid_map = dict([
    ("106", "Ingen tilbagefald"),
    ("107", "1 tilbagefald"),
    ("108", "2 tilbagefald"),
    ("109", "3 tilbagefald"),
    ("110", "4-9 tilbagefald"),
    ("111", "10 eller flere tilbagefald"),
])

# Codes for the previous-convictions dimension
previous_conviction_map = dict([
    ("0", "Ingen tidligere domme"),
    ("1", "1 Tidligere dom"),
    ("2", "2 Tidligere domme"),
    ("3", "3 Tidligere domme"),
    ("4", "4 Tidligere domme"),
    ("5", "5-9 Tidligere domme"),
    ("10", "10 eller flere tidligere domme"),
])

# Codes for the time-until-relapse dimension
var_duration_map = dict([
    ("0", "Ingen tilbagefald"),
    ("6", "I l√∏bet af 6 m√•neder"),
    ("712", "Efter 6 m√•neder og indenfor 1 √•r"),
    ("1324", "Efter 1 √•r og indenfor 2 √•r"),
])

# Codes for the gender dimension
gender_map = dict([
    ("M", "M√¶nd"),
    ("K", "Kvinder"),
])

# Codes for the age-group dimension
age_map = dict([
    ("1519", "15‚Äì19 √•r"),
    ("2024", "20‚Äì24 √•r"),
    ("2529", "25‚Äì29 √•r"),
    ("3034", "30‚Äì34 √•r"),
    ("3539", "35‚Äì39 √•r"),
    ("4049", "40‚Äì49 √•r"),
    ("5059", "50‚Äì59 √•r"),
    ("6099", "60+ √•r"),
])

# Translate each code column into a readable label column and report every code
# that has no entry in its lookup table. Series.map() turns such codes into NaN
# without raising, and the code columns are dropped further down, so unmapped
# codes would otherwise disappear unnoticed.
# Dict order is the order in which the label columns are appended to df_long.
code_label_maps = {
    "Recidivh√¶ndelser": recid_map,
    "Tidligere_domme": previous_conviction_map,
    "Varighed_til_tilbagefald": var_duration_map,
    "K√∏n": gender_map,
    "Alder": age_map,
}
for label_col, code_map in code_label_maps.items():
    codes = df_long[f"{label_col}_code"]
    df_long[label_col] = codes.map(code_map)

    # Purely diagnostic: the frame is not changed by this check.
    unmapped_codes = sorted(set(codes.dropna()) - set(code_map))
    if unmapped_codes:
        print(
            f"Warning: {label_col}_code contains codes without a label "
            f"(mapped to NaN): {unmapped_codes}"
        )

# 9) Format and type conversion, applied together on a new frame:
#    - Periode: the ":" between the two years is swapped for the dash-style
#      separator literal (regex=False -> plain text substitution).
#    - Antal: forced to a numeric dtype; unparseable values become NaN.
df_long = df_long.assign(
    Periode=df_long["Periode"].str.replace(":", "‚Äì", regex=False),
    Antal=pd.to_numeric(df_long["Antal"], errors="coerce"),
)

# 10) Drop the raw code columns; each one is named "<label column>_code".
code_columns = [
    f"{base}_code"
    for base in (
        "Recidivh√¶ndelser",
        "Tidligere_domme",
        "Varighed_til_tilbagefald",
        "K√∏n",
        "Alder",
    )
]
df_long = df_long.drop(columns=code_columns)

# 11) Basic checks: print a preview, the frame structure, the column dtypes and
#     summary statistics of the counts so the result can be inspected by eye.
#     Each heading and its payload are printed on consecutive lines (sep="\n").
print("\nüßæ Preview of the first 10 rows:", df_long.head(10), sep="\n")

print("\nüìã DataFrame structure:")
df_long.info()  # info() writes its report to stdout itself

print("\nüîç Data types:", df_long.dtypes, sep="\n")

print("\nüìä Statistical summary of 'Antal':", df_long["Antal"].describe(), sep="\n")

# 12) Write the cleaned long-format frame to CSV without the row index.
# NOTE(review): a later cell writes the same frame to
# "../Data/Recidiv10/CleanedRecidiv10.csv"; confirm whether this copy directly
# under ../Data/ is still wanted.
cleaned_csv_path = "../Data/CleanedRecidiv10.csv"
df_long.to_csv(cleaned_csv_path, index=False)
print(f"\n‚úÖ Exported cleaned data with shape: {df_long.shape}")

# Optional: preview up to 10 rows drawn at random from the wide (un-melted)
# frame `df`.
# - random_state pins the draw, so the same rows are shown on every run and the
#   saved output stays reproducible after a kernel restart.
# - min(...) keeps sample() from raising when the frame has fewer than 10 rows.
print("\nüé≤ Sample from original dataset:")
print(df.sample(n=min(10, len(df)), random_state=42))


üßæ Preview of the first 10 rows:
     Periode  Antal   Recidivh√¶ndelser        Tidligere_domme  \
0  2009‚Äì2011   4535  Ingen tilbagefald  Ingen tidligere domme   
1  2009‚Äì2011   3542  Ingen tilbagefald  Ingen tidligere domme   
2  2009‚Äì2011   4005  Ingen tilbagefald  Ingen tidligere domme   
3  2009‚Äì2011   4597  Ingen tilbagefald  Ingen tidligere domme   
4  2009‚Äì2011   9456  Ingen tilbagefald  Ingen tidligere domme   
5  2009‚Äì2011   6416  Ingen tilbagefald  Ingen tidligere domme   
6  2009‚Äì2011   5462  Ingen tilbagefald  Ingen tidligere domme   
7  2009‚Äì2011   2000  Ingen tilbagefald  Ingen tidligere domme   
8  2009‚Äì2011   1871  Ingen tilbagefald  Ingen tidligere domme   
9  2009‚Äì2011   2236  Ingen tilbagefald  Ingen tidligere domme   

  Varighed_til_tilbagefald      K√∏n     Alder  
0        Ingen tilbagefald     M√¶nd  20‚Äì24 √•r  
1        Ingen tilbagefald     M√¶nd  25‚Äì29 √•r  
2        Ingen tilbagefald     M√¶nd  30‚Äì34 √•r  
3        Ingen tilbage

In [6]:
from sklearn.preprocessing import LabelEncoder

# Label-encode a copy of the long-format frame; df_long itself is left as is.
df_encoded = df_long.copy()
categorical_cols = df_encoded.select_dtypes(include='object').columns

# Keep every fitted encoder, keyed by column name, so the integer codes in the
# encoded file can be translated back to their labels later
# (label_encoders[col].classes_ or .inverse_transform(...)).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the string "nan", so they are
    # encoded as a class of their own instead of mixing NaN with strings.
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le

df_long.to_csv("../Data/Recidiv10/CleanedRecidiv10.csv", index=False)
# The file name now matches the Recidiv10 dataset it is derived from
# (it was previously written as "EncodedRecidiv9.csv").
df_encoded.to_csv("../Data/Recidiv10/EncodedRecidiv10.csv", index=False)  # Optional export
print(f"\n‚úÖ Exported cleaned data with shape: {df_long.shape}")
print(f"‚úÖ Exported encoded data with shape: {df_encoded.shape}")


‚úÖ Exported cleaned data with shape: (28224, 7)
‚úÖ Exported encoded data with shape: (28224, 7)
