In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

In [8]:
#Import the Non-League football CSV file containing teams, their league position, their promotion status and their twitter followers
football = pd.read_csv("football_data_final_edit.csv")
#Drop null rows which erroneously came with the CSV 
football.drop([0,1], inplace=True)
#Drop null columns which erroneously came with the CSV
football.drop(["Unnamed: 25","Unnamed: 26","Unnamed: 27","Unnamed: 28"],inplace=True,axis=1)
#Since we dropped some rows, we need to reset the index of the dataframe
football = football.reset_index(drop=True)
#The column headings came from the CSV as the first row, redefine the headings correctly and strip any whitespace with lambda func
strip_whitespace = lambda x: x.strip() 
column_headings = list(map(strip_whitespace,list(football.iloc[0])))
football.columns = column_headings
#Remove the now un-needed dupiclate row containing the column headings and reset the index again.
football.drop(football[football["Step"]=="0"].index,inplace=True)
football.drop([0],inplace=True)
football = football.reset_index(drop=True)
#Let's trim any whitespace from entries and column headings
football =football.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
football.head()
football["Positional Improvement"] = np.where(football["2022-23\nPosition"] < football["2021-22\nPosition"],
                                              np.where(football["2021-22 Promoted"] == 1, 1, np.where(football["2021-22 Promoted"] == 2, 1,0)), 
                                              np.where(football["2021-22 Promoted"] != 2 , 0, 1))
football

Unnamed: 0,Step,League,2021-22\nPosition,2021-22 Promoted,2022-23\nPosition,2022-23 Promoted,Teams,06.12.18,17.01.19,26.03.19,...,04.04.21,22.01.22,27.05.22,29.06.22,06.10.22,18.12.22,19.04.23,15.07.23,Gained between 19/4/23 & 29/6/22,Positional Improvement
0,2,National League South,1,2,4,2,Aveley,7508,8332,8529,...,10512,11122,11511,11600,11830,11950,12261,12812,661,0
1,3,Isthmian Premier League,2,2,1,2,Chatham Town,,,4575,...,6475,7476,7998,8153,,8761,9247,9541,1094,0
2,3,Northern Premier Division,1,2,1,2,Macclesfield Town,,,23658,...,28617,38189,40507,40923,,42850,44038,44193,3115,0
3,3,Southern Premier Div. Central,1,2,4,2,Long Eaton United,,,4419,...,5666,6229,6591,6648,,7022,7233,7609,585,0
4,3,Southern Premier Div. South,2,2,2,2,Walton & Hersham,,,4369,...,5621,7110,8197,8330,,10032,10948,12746,2618,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,7,,19,,20,,Rocester,,,3317,...,,4386,,4579,,4711,4827,,1510,0
1003,,,,,,,,,,,...,,,,,,,,,,0
1004,7,,14,,20,,Thame United Reserves,,,223,...,,226,,518,,534,545,,322,0
1005,7,,11,,,,Walton Casuals - dissolved,4513,4637,4819,...,5873,6823,7032,7082,6998,6955,6862,,2043,0


In [16]:
#For loop to determine whether team improved on their ranking from last season. 2021/22 - 2022/23
positional_improvement = []
for i in range(0,len(football)):
    if float(football.iloc[i]["2022-23\nPosition"]) < float(football.iloc[i]["2021-22\nPosition"]):
        if float(football.iloc[i]["2021-22 Promoted"]) == 1:
            positional_improvement.append(1)
        elif float(football.iloc[i]["2021-22 Promoted"]) == 2:
            positional_improvement.append(1)
        else: 
            positional_improvement.append(0)
    else:
        if float(football.iloc[i]["2021-22 Promoted"]) == 2: 
            positional_improvement.append(1)
        else:
            positional_improvement.append(0)
#Set this created list to be the new column "Positional Improvement"
football["Positional Improvement"] = positional_improvement
football

Unnamed: 0,Step,League,2021-22\nPosition,2021-22 Promoted,2022-23\nPosition,2022-23 Promoted,Teams,06.12.18,17.01.19,26.03.19,...,04.04.21,22.01.22,27.05.22,29.06.22,06.10.22,18.12.22,19.04.23,15.07.23,Gained between 19/4/23 & 29/6/22,Positional Improvement
0,2,National League South,1,2,4,2,Aveley,7508,8332,8529,...,10512,11122,11511,11600,11830,11950,12261,12812,661,1
1,3,Isthmian Premier League,2,2,1,2,Chatham Town,,,4575,...,6475,7476,7998,8153,,8761,9247,9541,1094,1
2,3,Northern Premier Division,1,2,1,2,Macclesfield Town,,,23658,...,28617,38189,40507,40923,,42850,44038,44193,3115,1
3,3,Southern Premier Div. Central,1,2,4,2,Long Eaton United,,,4419,...,5666,6229,6591,6648,,7022,7233,7609,585,1
4,3,Southern Premier Div. South,2,2,2,2,Walton & Hersham,,,4369,...,5621,7110,8197,8330,,10032,10948,12746,2618,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,7,,19,,20,,Rocester,,,3317,...,,4386,,4579,,4711,4827,,1510,0
1003,,,,,,,,,,,...,,,,,,,,,,0
1004,7,,14,,20,,Thame United Reserves,,,223,...,,226,,518,,534,545,,322,0
1005,7,,11,,,,Walton Casuals - dissolved,4513,4637,4819,...,5873,6823,7032,7082,6998,6955,6862,,2043,0


In [17]:
#Drop rows which are step 0 as these aren't non-league teams.
football = football.sort_values(by=["Step"])
#Sort values in dataframe by the step, then drop NaN values from league and step as these are not compatible with later analysis
football.dropna(subset =["Step","League","Gained between 19/4/23 & 29/6/22"],how="any",inplace=True)
#Reset the index based on the sorted table
football = football.reset_index(drop=True)
#Remove commas from dataframe values
football = football.replace(",","",regex=True)
#print table
football

Unnamed: 0,Step,League,2021-22\nPosition,2021-22 Promoted,2022-23\nPosition,2022-23 Promoted,Teams,06.12.18,17.01.19,26.03.19,...,04.04.21,22.01.22,27.05.22,29.06.22,06.10.22,18.12.22,19.04.23,15.07.23,Gained between 19/4/23 & 29/6/22,Positional Improvement
0,1,National League,15,1,4,1,Woking,16328,16948,17471,...,21065,22526,23403,23589,24308,24970,26117,26546,2528,1
1,1,National League,20,1,18,1,Aldershot Town,35129,35317,35481,...,37892,39369,40105,40330,41004,41402,42451,42770,2121,1
2,1,National League,19,1,9,1,Eastleigh,22231,22456,22830,...,28655,30408,31501,31601,32350,33045,34255,34512,2654,1
3,1,National League,18,1,5,1,Barnet,29736,30336,31292,...,34864,36677,37524,37665,38521,39091,40470,40747,2805,1
4,1,National League,17,1,20,1,Maidenhead United,9784,9957,10207,...,13007,14266,14860,14973,15422,15678,16064,16347,1091,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,6,Northern League Div. Two,5,1,11,1,Easington Colliery,,,2016,...,,3255,,3582,,3728,3803,3867,221,0
936,6,Northern League Div. Two,3,1,9,1,Boldon Community Association,,,,...,,1892,,2382,,2512,2588,2649,206,0
937,6,Northern Counties East League Div. One,18,1,19,1,Swallownest,,,1407,...,,393,,833,,1089,1396,1487,563,0
938,6,Northern League Div. Two,11,1,15,1,Newcastle University,,,1392,...,,1775,,1898,,0,2018,2039,120,0


In [20]:
#Turn our features and targets into int64's.
football["Gained between 19/4/23 & 29/6/22"] = football["Gained between 19/4/23 & 29/6/22"].apply(pd.to_numeric)
football["2022-23 Promoted"] = football["2022-23 Promoted"].apply(pd.to_numeric)
#Define features as x and targets as y. 
x = np.array(football["Gained between 19/4/23 & 29/6/22"])
y = np.array(football["Positional Improvement"])
#Perform a train test split, 80% train and 20% test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=13)

In [22]:
pipeline = Pipeline([("transform",StandardScaler()),("classifier",SGDClassifier())])

pipeline.fit(x_train.reshape(-1,1),y_train)
pipeline.predict(x_test.reshape(-1,1))
print(pipeline.score(x_test.reshape(-1,1),y_test))
np.unique(y_test,return_counts=True)
#Now have altered the model such that the features remain as the gained twitter followers but the labels are now "positional improvement".


0.5691489361702128


(array([0, 1], dtype=int64), array([ 86, 102], dtype=int64))