In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EDA – WUR (Kap. 6)\n",
    "Tento notebook pokrývá kapitolu 6: deskriptivní statistiku, trendové grafy, korelace a předběžnou identifikaci klíčových faktorů."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports a načtení datasetu\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from pathlib import Path\n",
    "\n",
    "# Relative location of the cleaned WUR dataset (parquet file).\n",
    "DATA_PATH = Path(\"data/clean/wur_dataset.parquet\")\n",
    "\n",
    "# Load the table once; the cells below all read from `df`.\n",
    "df = pd.read_parquet(DATA_PATH)\n",
    "\n",
    "# Sanity check: row count and column names, followed by a preview of the first rows.\n",
    "n_rows = df.shape[0]\n",
    "column_names = df.columns.tolist()\n",
    "print(\"Počet řádků:\", n_rows)\n",
    "print(\"Sloupce:\", column_names)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1 Deskriptivní statistika"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics for every numeric column of `df`,\n",
    "# transposed so that each row of the result describes one column.\n",
    "numeric_cols = list(df.select_dtypes(include=[np.number]).columns)\n",
    "desc = df.loc[:, numeric_cols].describe().transpose()\n",
    "desc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 Vizualizace trendů"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper: draw the time series of one metric for a selected university.\n",
    "def plot_university_trend(university_name, metric=\"overall_score\"):\n",
    "    \"\"\"Plot how one metric evolves over the years for a single university.\n",
    "\n",
    "    Reads the notebook-level DataFrame `df`, keeps the rows whose `university`\n",
    "    value equals `university_name`, orders them by `year` and draws a line chart\n",
    "    with one marker per row.\n",
    "\n",
    "    Args:\n",
    "        university_name: Value matched exactly against the `university` column.\n",
    "        metric: Name of the column plotted on the y-axis.\n",
    "\n",
    "    Returns:\n",
    "        None. When no rows match, a message is printed and nothing is plotted.\n",
    "    \"\"\"\n",
    "    selected = df.loc[df[\"university\"] == university_name].sort_values(\"year\")\n",
    "    if selected.empty:\n",
    "        print(\"Nenalezeny záznamy pro univerzitu:\", university_name)\n",
    "        return\n",
    "    _, ax = plt.subplots()\n",
    "    ax.plot(selected[\"year\"], selected[metric], marker=\"o\")\n",
    "    ax.set_title(f\"{university_name} – trend: {metric}\")\n",
    "    ax.set_xlabel(\"Rok\")\n",
    "    ax.set_ylabel(metric)\n",
    "    ax.grid(True)\n",
    "    plt.show()\n",
    "\n",
    "# Example: use the first non-missing university name that appears in the dataset.\n",
    "example_university = df[\"university\"].dropna().iloc[0]\n",
    "plot_university_trend(example_university, metric=\"overall_score\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Country-level trends: mean overall score per year for the N countries\n",
    "# with the largest number of distinct universities in the dataset.\n",
    "N = 5\n",
    "country_counts = (\n",
    "    df.groupby(\"country\")[\"university\"]\n",
    "    .nunique()\n",
    "    .sort_values(ascending=False)\n",
    ")\n",
    "top_countries = country_counts.index[:N].tolist()\n",
    "\n",
    "# One row per (country, year) pair holding the mean overall_score.\n",
    "in_top_countries = df[\"country\"].isin(top_countries)\n",
    "agg = (\n",
    "    df.loc[in_top_countries]\n",
    "    .groupby([\"country\", \"year\"])[\"overall_score\"]\n",
    "    .mean()\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# One line per country, drawn in the same order as `top_countries`.\n",
    "fig, ax = plt.subplots()\n",
    "for country_name in top_countries:\n",
    "    country_trend = agg.loc[agg[\"country\"] == country_name].sort_values(\"year\")\n",
    "    ax.plot(country_trend[\"year\"], country_trend[\"overall_score\"], marker=\"o\", label=country_name)\n",
    "ax.set_title(\"Průměrné celkové skóre – top země podle počtu univerzit\")\n",
    "ax.set_xlabel(\"Rok\")\n",
    "ax.set_ylabel(\"overall_score\")\n",
    "ax.grid(True)\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3 Korelace indikátorů"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pearson correlation matrix of the main indicators.\n",
    "# Only the indicator columns that exist in `df` are kept, in the order listed here.\n",
    "indicator_candidates = (\n",
    "    \"overall_score\",\n",
    "    \"teaching\",\n",
    "    \"research\",\n",
    "    \"citations\",\n",
    "    \"industry_income\",\n",
    "    \"international_outlook\",\n",
    ")\n",
    "indicators = [name for name in indicator_candidates if name in df.columns]\n",
    "corr = df.loc[:, indicators].corr(method=\"pearson\")\n",
    "corr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation heatmap drawn with plain matplotlib (no seaborn).\n",
    "# Pearson coefficients lie in [-1, 1], so the colour scale is pinned to that range and a\n",
    "# diverging colormap is used: 0 maps to the neutral midpoint and colours stay comparable\n",
    "# between runs instead of being stretched to the data's own min/max.\n",
    "fig, ax = plt.subplots()\n",
    "im = ax.imshow(corr.values, aspect='auto', cmap=\"coolwarm\", vmin=-1, vmax=1)\n",
    "ax.set_xticks(range(len(corr.columns)))\n",
    "ax.set_xticklabels(corr.columns, rotation=45, ha=\"right\")\n",
    "ax.set_yticks(range(len(corr.index)))\n",
    "ax.set_yticklabels(corr.index)\n",
    "\n",
    "# Write each coefficient into its cell so exact values can be read off the figure.\n",
    "for i in range(len(corr.index)):\n",
    "    for j in range(len(corr.columns)):\n",
    "        ax.text(j, i, f\"{corr.iat[i, j]:.2f}\", ha=\"center\", va=\"center\")\n",
    "\n",
    "fig.colorbar(im, ax=ax)\n",
    "ax.set_title(\"Korelační matice\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.4 Předběžná identifikace klíčových faktorů"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation of every numeric column with the overall score and, when present, with the rank.\n",
    "target_cols = [name for name in (\"overall_score\", \"rank\") if name in df.columns]\n",
    "\n",
    "# Candidate features: all numeric columns except year / university / country.\n",
    "features = [c for c in df.columns if c not in [\"year\",\"university\",\"country\"]]\n",
    "num_features = df[features].select_dtypes(include=[np.number]).columns.tolist()\n",
    "\n",
    "results = {}\n",
    "for target in target_cols:\n",
    "    # Leave the target out of its own feature list: its self-correlation is trivially 1.0\n",
    "    # and would otherwise always sit at the top of the ranking.\n",
    "    predictors = [c for c in num_features if c != target]\n",
    "    corrs = df[predictors].corrwith(df[target]).sort_values(ascending=False)\n",
    "    results[target] = corrs.to_frame(name=f\"corr_with_{target}\")\n",
    "\n",
    "# Show one table per target.\n",
    "for target, corr_df in results.items():\n",
    "    display(corr_df)\n",
    "\n",
    "# Top 10 columns by absolute correlation with overall_score (if that column exists).\n",
    "if \"overall_score\" in results:\n",
    "    score_col = \"corr_with_overall_score\"\n",
    "    # `by` is mandatory for DataFrame.sort_values; sort on the single correlation column.\n",
    "    top10 = (\n",
    "        results[\"overall_score\"]\n",
    "        .dropna()\n",
    "        .abs()\n",
    "        .sort_values(by=score_col, ascending=False)\n",
    "        .head(10)\n",
    "    )\n",
    "    print(\"\\nTop 10 absolutních korelací s overall_score:\")\n",
    "    print(top10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


NameError: name 'null' is not defined