# REDDIT FLAIR DETECTOR AND PREDICTOR
## Modules required to be installed and imported

In [None]:
# Install scikit-learn, which provides the pipeline, vectorizers and metrics imported below.
# NOTE(review): a bare `pip` line presumably relies on the notebook's automagic; confirm it runs in the target environment.
pip install scikit-learn

In [None]:
import os
import pickle
import re

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# NOTE(review): `praw` and NLTK's `stopwords` are used further down but are not
# imported in the visible cells; confirm they are imported elsewhere.

## Loading the CSV file and combining data for training and testing
#### A new column, COMBINED, has been added to the data loaded from the CSV file: each row holds the title of a post followed by that post's comments. Detailed EDA showed that the post body contains a lot of verbiage; using the body of every post to build the model would lower the accuracy and add considerable latency. The URL of a post contains too many noisy symbols to help with detection, so it is omitted from the training process as well.

In [None]:
# Load the scraped posts; the code below reads the `title`, `comments`,
# `body`, `url` and `flair` columns from this frame.
posts_data = pd.read_csv('Reddit_India_Data.csv')

# Empty CSV cells are read as NaN, and `NaN + str` yields NaN, which a text
# vectorizer cannot process. Treat a missing title/comment as an empty string
# so every row gets a usable COMBINED document.
# The two fields are joined without a separator on purpose: detect_flair()
# builds its prediction input the same way.
posts_data['COMBINED'] = (
    posts_data['title'].fillna('').astype(str)
    + posts_data['comments'].fillna('').astype(str)
)

# Linear SVM Model used for training and testing
#### After comparing several classification models, a linear Support Vector Machine was found to fit the data best. A classification report is displayed for the individual flairs. The trained model is stored as a .pkl file for later use by the predictor.

In [None]:
def linear_svm(X_train, X_test, y_train, y_test):
    """Train a linear SVM text classifier, save it, and print its scores.

    The model is a pipeline of token counts, TF-IDF weighting and an
    SGD-trained linear classifier with hinge loss. The fitted pipeline is
    pickled to ``model.pkl`` in the current working directory.

    Args:
        X_train: Documents used to fit the pipeline.
        X_test: Documents used for evaluation.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        None. Accuracy and a per-label classification report are printed.
    """
    from sklearn.linear_model import SGDClassifier

    model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=42, max_iter=5, tol=None)),
    ])
    model.fit(X_train, y_train)

    # Use a context manager so the file is flushed and closed before anyone
    # tries to load the model back.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    y_pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # No target_names: the report names each row after its label. target_names
    # are matched to the sorted labels by position, so a hand-ordered list such
    # as `flairs` would put the wrong name on most rows.
    # NOTE(review): assumes the labels are the flair strings themselves.
    print(classification_report(y_test, y_pred))


# Training and Testing of data for Flair Prediction
#### The train test split is taken as 70% and 30% respectively

In [None]:
def train_test(X, y):
    """Split the data 70/30 and evaluate the linear SVM on the held-out part.

    Args:
        X: Input documents.
        y: Labels aligned with ``X``.
    """
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # exactly the positional order linear_svm expects.
    split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
    print("Results of Linear Support Vector Machine Classifier")
    linear_svm(*split_parts)

# Flair names referenced when reporting results.
flairs = [
    "AskIndia",
    "Non-Political",
    "Coronavirus",
    "Scheduled",
    "Photography",
    "Science/Technology",
    "Politics",
    "Business/Finance",
    "Policy/Economy",
    "Sports",
    "Food",
]

# Label column.
output = posts_data.flair

# Candidate input columns taken from the loaded frame.
Title_Input = posts_data.title
Comments_Input = posts_data.comments
Combined_Input = posts_data.COMBINED
Y = posts_data.body
Z = posts_data.url


# Reddit Flair Prediction 
#### The pickled model is loaded and r/India posts are fetched by their URLs. The text fetched for each URL goes through the same procedure of conversion to strings and cleaning for text pre-processing, and is then run through the model to predict the post's flair. Prediction is limited to 2 posts at a time because of Reddit's scraping rule of not more than 2-3 requests in one go. Predicted flairs are printed along with the actual flairs so that the accuracy of the predictions can be verified.

In [None]:
# Raw strings keep the regex escapes (\[ \] \|) out of Python's own string
# escape processing, which warns about unknown escapes in plain literals.

# Characters that are replaced by a space.
space_symbols = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_' is removed.
delete_symbols = re.compile(r'[^0-9a-z #+_]')
# English stopwords, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def cleaning(text):
    """Normalise *text* for the classifier.

    Lowercases the text, turns the characters matched by ``space_symbols``
    into spaces, removes everything matched by ``delete_symbols``, and drops
    tokens found in ``STOPWORDS``.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    spaced = space_symbols.sub(' ', lowered)
    stripped = delete_symbols.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Reddit API client. The credentials are read from the environment variables
# REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET so that secrets are not committed
# with the notebook; a missing variable raises KeyError.
# NOTE(review): any credentials that were previously committed in this file
# should be treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Scrapping Reddit_data',
)

# Unpickling can execute arbitrary code: only load a model.pkl that was
# produced by the training step of this notebook.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

def detect_flair(url):
    """Predict the flair of the Reddit submission found at *url*.

    The submission's title and the text of its top-level comments are cleaned
    with ``cleaning`` and passed to ``loaded_model``.

    Args:
        url: URL of the Reddit submission; converted with ``str`` before use.

    Returns:
        The first (only) prediction returned by ``loaded_model.predict``.
    """
    submission = reddit.submission(url=str(url))

    # Resolve all "load more comments" placeholders first so that every item
    # iterated below has a `body`.
    submission.comments.replace_more(limit=None)
    comment_text = ' '.join(
        top_level_comment.body for top_level_comment in submission.comments
    )

    # Title and comments are concatenated without a separator, matching how
    # the COMBINED training column is built from `title + comments`.
    combined = cleaning(str(submission.title)) + cleaning(comment_text)

    # predict() takes a collection of documents; wrap the single document.
    return loaded_model.predict([combined])[0]

subreddit = reddit.subreddit('india')

# Predict the flair of the top 2 posts and show it next to the post's real flair.
for submission in subreddit.top(limit=2):
    # `submission.url` is the linked content (image, article, ...) for link
    # posts, which is not a Reddit thread URL. The permalink always identifies
    # the thread itself, which is what detect_flair needs.
    post_url = 'https://www.reddit.com' + submission.permalink
    print('predicted: %s | actual: %s'
          % (detect_flair(post_url), submission.link_flair_text))