In [1]:
import numpy as np 
import pandas as pd 
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
# Start a local-mode Spark session for this notebook, or reuse the one that
# is already running (getOrCreate), and display it as the cell output.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ShortNSimple")
    .getOrCreate()
)
spark

In [3]:
# Load the Wine CSV with pandas first, then hand the pandas frame to Spark.
wine_pdf = pd.read_csv("datasets/wine.csv")
wine_data = spark.createDataFrame(wine_pdf)

# Preview the first 5 rows without truncating wide cell values.
wine_data.show(n=5, truncate=False)

+-----+------------+---------------+--------+-----------------------+--------------+-----------------+---------------+-------------------------+--------------------+-------------------+--------+---------------------------------+------------------------+
|Class| 	1) Alcohol| 	2) Malic acid| 	3) Ash|	4) Alcalinity of ash  | 	5) Magnesium|	6) Total phenols| 	7) Flavanoids| 	8) Nonflavanoid phenols| 	9) Proanthocyanins|	10)Color intensity| 	11)Hue| 	12)OD280/OD315 of diluted wines| 	13)Proline            |
+-----+------------+---------------+--------+-----------------------+--------------+-----------------+---------------+-------------------------+--------------------+-------------------+--------+---------------------------------+------------------------+
|1    |14.23       |1.71           |2.43    |15.6                   |127           |2.8              |3.06           |0.28                     |2.29                |5.64               |1.04    |3.92                             |1065      

In [4]:
# Print the raw column names exactly as they were read from the CSV header;
# the list form makes the stray tabs, padding spaces and "N)" prefixes visible.
print(wine_data.columns)

['Class', ' \t1) Alcohol', ' \t2) Malic acid', ' \t3) Ash', '\t4) Alcalinity of ash  ', ' \t5) Magnesium', '\t6) Total phenols', ' \t7) Flavanoids', ' \t8) Nonflavanoid phenols', ' \t9) Proanthocyanins', '\t10)Color intensity', ' \t11)Hue', ' \t12)OD280/OD315 of diluted wines', ' \t13)Proline            ']


## Rename a column

In [12]:
# The label column is named "Class" in the raw CSV header. Rename it to the
# same "N) name" shape as the other headers so a single cleanup rule can be
# applied to every column.
# NOTE: withColumnRenamed silently does nothing when the source column does
# not exist, so the existing name has to match the header exactly.
wine_data = wine_data.withColumnRenamed(
    "Class",
    "0) class_labels"
)
print(wine_data.columns)

['0) class_labels', ' \t1) Alcohol', ' \t2) Malic acid', ' \t3) Ash', '\t4) Alcalinity of ash  ', ' \t5) Magnesium', '\t6) Total phenols', ' \t7) Flavanoids', ' \t8) Nonflavanoid phenols', ' \t9) Proanthocyanins', '\t10)Color intensity', ' \t11)Hue', ' \t12)OD280/OD315 of diluted wines', ' \t13)Proline            ']


In [13]:
# Keep the current (still messy) header names; they are the "from" side when
# the cleaned names are computed and applied.
old_columns = wine_data.columns

In [14]:
# Worked example of the cleanup on one header: trim the surrounding tabs/spaces,
# keep the text after the ")" of the "1)" prefix, then trim the leftover space.
' \t1) Alcohol'.strip('\t ').split(')')[1].strip()

'Alcohol'

In [17]:
def _clean_column_name(raw_name):
    """Turn a raw CSV header into a snake_case column name.

    Strips surrounding tabs/spaces, drops a leading "N)" index when present,
    then lower-cases the rest and replaces spaces with underscores.
    A header without any ")" is normalised as-is instead of raising
    IndexError.
    """
    trimmed = raw_name.strip('\t ')
    # maxsplit=1 with [-1]: everything after the first ")", or the whole
    # header when it carries no "N)" prefix at all.
    label = trimmed.split(')', 1)[-1]
    return label.strip().lower().replace(' ', '_')


new_column_name = [_clean_column_name(each_col) for each_col in old_columns]
print(new_column_name)

['class_labels', 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


## Rename multiple columns

In [19]:
# Apply the renames one pair at a time: each original header is matched by
# position with its cleaned replacement.
rename_pairs = zip(old_columns, new_column_name)
for current_name, cleaned_name in rename_pairs:
    wine_data = wine_data.withColumnRenamed(current_name, cleaned_name)

# Confirm that every column now carries its cleaned name.
print(wine_data.columns)

['class_labels', 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [20]:
# Preview the first 5 rows under the cleaned headers, without truncating values.
wine_data.show(n=5, truncate=False)

+------------+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+
|class_labels|alcohol|malic_acid|ash |alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity|hue |od280/od315_of_diluted_wines|proline|
+------------+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+
|1           |14.23  |1.71      |2.43|15.6             |127      |2.8          |3.06      |0.28                |2.29           |5.64           |1.04|3.92                        |1065   |
|1           |13.2   |1.78      |2.14|11.2             |100      |2.65         |2.76      |0.26                |1.28           |4.38           |1.05|3.4                         |1050   |
|1           |13.16  |2.36      |2.67|18.6             |101      