In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

In [7]:
# Load the Non-League football CSV: teams, their league position, their
# promotion status and their Twitter follower counts.
raw_table = pd.read_csv("football_data_final_edit.csv")

# The file arrives with two unwanted leading rows and four unwanted
# "Unnamed" columns; remove both and renumber the remaining rows from zero.
empty_columns = ["Unnamed: 25", "Unnamed: 26", "Unnamed: 27", "Unnamed: 28"]
football = (
    raw_table
    .drop(index=[0, 1])
    .drop(columns=empty_columns)
    .reset_index(drop=True)
)


def strip_whitespace(x):
    """Return ``x`` with leading and trailing whitespace removed."""
    return x.strip()


# The real column headings are stored in the first remaining row; clean them
# up and install them as the frame's column labels.
column_headings = [strip_whitespace(heading) for heading in football.iloc[0]]
football.columns = column_headings

# Remove every row whose Step is "0", then the row labelled 0 (the duplicate
# of the headings that was just promoted), and renumber the rows once more.
step_zero_labels = football.loc[football["Step"] == "0"].index
football = (
    football
    .drop(index=step_zero_labels)
    .drop(index=[0])
    .reset_index(drop=True)
)


def _strip_text_column(column):
    """Strip whitespace from the entries of object-dtype columns only."""
    return column.str.strip() if column.dtype == "object" else column


# Trim stray whitespace from the text entries, then preview the result.
football = football.apply(_strip_text_column)
football.head()

Unnamed: 0,Step,League,2021-22\nPosition,2021-22 Promoted,2022-23\nPosition,2022-23 Promoted,Teams,06.12.18,17.01.19,26.03.19,...,03.02.21,04.04.21,22.01.22,27.05.22,29.06.22,06.10.22,18.12.22,19.04.23,15.07.23,Gained between 19/4/23 & 29/6/22
0,2,National League South,1,2,4,2,Aveley,7508.0,8332.0,8529,...,10490,10512,11122,11511,11600,11830.0,11950,12261,12812,661
1,3,Isthmian Premier League,2,2,1,2,Chatham Town,,,4575,...,6350,6475,7476,7998,8153,,8761,9247,9541,1094
2,3,Northern Premier Division,1,2,1,2,Macclesfield Town,,,23658,...,27832,28617,38189,40507,40923,,42850,44038,44193,3115
3,3,Southern Premier Div. Central,1,2,4,2,Long Eaton United,,,4419,...,5524,5666,6229,6591,6648,,7022,7233,7609,585
4,3,Southern Premier Div. South,2,2,2,2,Walton & Hersham,,,4369,...,5407,5621,7110,8197,8330,,10032,10948,12746,2618


In [19]:
# Order the rows by Step, discard any row missing its Step, League or
# follower-gain value (these cannot be used by the later analysis), renumber
# the rows to follow the sorted order, and strip every comma from the values.
required_fields = ["Step", "League", "Gained between 19/4/23 & 29/6/22"]
football = (
    football
    .sort_values(by=["Step"])
    .dropna(subset=required_fields, how="any")
    .reset_index(drop=True)
    .replace(",", "", regex=True)
)
# Display the cleaned table.
football

Unnamed: 0,Step,League,2021-22\nPosition,2021-22 Promoted,2022-23\nPosition,2022-23 Promoted,Teams,06.12.18,17.01.19,26.03.19,...,03.02.21,04.04.21,22.01.22,27.05.22,29.06.22,06.10.22,18.12.22,19.04.23,15.07.23,Gained between 19/4/23 & 29/6/22
0,1,National League,15,1,4,1,Woking,16328,16948,17471,...,20791,21065,22526,23403,23589,24308,24970,26117,26546,2528
1,1,National League,23,0,12,1,Oldham Athletic,55347,,60079,...,,,70581,73678,74230,79166,80837,82919,84103,8689
2,1,National League,8,1,10,1,Dagenham & Redbridge,34072,34312,34498,...,36735,36958,38607,39704,39836,40554,41001,41894,42060,2058
3,1,National League,7,1,3,1,Chesterfield,76096,76699,77155,...,79097,79373,88210,93297,93808,95478,96412,98105,99776,4297
4,1,National League,3,1,15,1,Solihull Moors,16091,16675,17573,...,22556,22762,25115,26291,28292,29491,30065,30778,31102,2486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,6,North West Counties League Div. One North,1,2,3,1,FC St Helens,,,,...,,,,,3106,,3308,3556,3652,450
936,6,Wessex League Div. One,4,1,11,1,Folland Sports,,,854,...,1350,,2709,,2890,,2993,3049,3081,159
937,6,North West Counties League Div. One North,3,2,5,1,Euxton Villa,,,,...,,,,,3254,,3467,3668,3753,414
938,6,Eastern Counties League Div. One South,2,2,2,1,Basildon Town,,,,...,,,,,2208,,2474,2696,2919,488


In [115]:
# Convert the feature and target columns to numeric dtypes. pd.to_numeric is
# called on each whole column (vectorised) instead of value-by-value through
# Series.apply; the resulting numbers are the same. The dtype is inferred
# from the data (int64 when every value is a whole number, float64 otherwise).
FEATURE_COLUMN = "Gained between 19/4/23 & 29/6/22"
TARGET_COLUMN = "2022-23 Promoted"
football[FEATURE_COLUMN] = pd.to_numeric(football[FEATURE_COLUMN])
football[TARGET_COLUMN] = pd.to_numeric(football[TARGET_COLUMN])

# x is the single feature (followers gained) and y the promotion-status
# label. Both are 1-D arrays; x is reshaped to a column where it is used.
x = np.array(football[FEATURE_COLUMN])
y = np.array(football[TARGET_COLUMN])

# Hold out 20% of the rows for testing. random_state is fixed so that the
# split is identical on every run.
# NOTE(review): the label counts shown after scoring are heavily imbalanced;
# consider passing stratify=y here - to be confirmed, as it changes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=13
)

In [122]:
# Standardise the single feature, then fit a linear classifier trained with
# stochastic gradient descent. SGDClassifier shuffles the training data, so
# random_state is fixed to make the fitted model (and its score) the same on
# every run.
pipeline = Pipeline([
    ("transform", StandardScaler()),
    ("classifier", SGDClassifier(random_state=13)),
])

# The feature vectors are 1-D; reshape them once into single-column 2-D
# arrays, the layout passed to the pipeline below.
x_train_2d = x_train.reshape(-1, 1)
x_test_2d = x_test.reshape(-1, 1)

pipeline.fit(x_train_2d, y_train)

# Keep the predictions and the accuracy in named variables so they can be
# inspected afterwards (they were previously computed and thrown away).
test_predictions = pipeline.predict(x_test_2d)
test_accuracy = pipeline.score(x_test_2d, y_test)

# Class balance of the held-out labels; this is the cell's displayed output.
np.unique(y_test, return_counts=True)
# Author's observation: with club promotion status as the classification label
# and followers gained as the only feature, the model predicts that every club
# remains in its league. Promotions/demotions are comparatively rare, so the
# model scores highly despite assigning every club to the same class. Compare
# np.unique(test_predictions, return_counts=True) with the counts above to
# check this.

(array([0, 1, 2], dtype=int64), array([ 17, 153,  18], dtype=int64))